library(tidyverse)
library(ggplot2)
library(ggalluvial)
library(biomaRt)
library(ggdendro)
library(pcaMethods)
library(uwot)
library(pheatmap)
library(ggplotify)
library (ggrepel)
library(ggraph)
library(patchwork)



select <- dplyr::select
tmm_sample <- read_csv("./data/final_data/curated_pTMM_rattus_norvegicus_v103.csv")
Rows: 22245 Columns: 353── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): target_id
dbl (352): ventral.forebrain_male.3, ventral.forebrain_male.2, ventral.forebrain_female.3, ventral.forebrain_female...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
metadata <- read_csv("./data/final_data/curated_metadata.csv")
Rows: 352 Columns: 13── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): ID, Acronym, sex, tissue_name, region_tissue_name, consensus_tissue_name, organ_name, tissue_color, regio...
dbl  (1): rat_n
lgl  (1): regional_tissue
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all(sort(colnames(tmm_sample[-1])) == sort(metadata$ID))
[1] TRUE

#Generate hierarchical data



#slow, but understanable
tmm_tissue <- tmm_sample %>% 
  gather(sample, tmm, -1) %>% 
  left_join(metadata %>% select(ID,tissue_name), by = c("sample" = "ID")) %>% 
  group_by(target_id, tissue_name) %>% 
  summarize(tmm = mean(tmm)) %>% 
  ungroup()
`summarise()` has grouped output by 'target_id'. You can override using the `.groups` argument.
write_csv(tmm_tissue, file="./data/final_data/final_tmm_tissue_name.csv")

tmm_region_tissue <- tmm_tissue %>% 
  left_join(metadata %>% select(tissue_name,region_tissue_name), by = c("tissue_name" = "tissue_name")) %>% 
  group_by(target_id, region_tissue_name) %>% 
  summarize(tmm = max(tmm)) %>% 
  ungroup()
`summarise()` has grouped output by 'target_id'. You can override using the `.groups` argument.
write_csv(tmm_region_tissue, file="./data/final_data/final_tmm_region_tissue_name.csv")

tmm_consensus_tissue <- tmm_region_tissue %>% 
  left_join(metadata %>% select(region_tissue_name,consensus_tissue_name), by = c("region_tissue_name" = "region_tissue_name")) %>% 
  group_by(target_id, consensus_tissue_name) %>% 
  summarize(tmm = max(tmm)) %>% 
  ungroup()
`summarise()` has grouped output by 'target_id'. You can override using the `.groups` argument.
write_csv(tmm_consensus_tissue, file="./data/final_data/final_tmm_consensus_name.csv")

#check
all(metadata$tissue_name %>% unique() %>% sort() == tmm_tissue$tissue_name %>% unique() %>% sort())
[1] TRUE
all(metadata$region_tissue_name %>% unique() %>% sort() == tmm_region_tissue$region_tissue_name %>% unique() %>% sort())
[1] TRUE
all(metadata$consensus_tissue_name %>% unique() %>% sort() == tmm_consensus_tissue$consensus_tissue_name %>% unique() %>% sort())
[1] TRUE
# # faster, but less understandable
# unique_tissue <- select(metadata,tissue_name) %>%
#   distinct()
# tmm_by_tissue <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_tissue$tissue_name) {
#   curent_tissue_meta = metadata[metadata$tissue_name == i,]
#   current_tmm = select(tmm_sample,c(target_id, curent_tissue_meta$ID))
#   means <- current_tmm %>% select(curent_tissue_meta$ID) %>% rowMeans()
#   tmm_by_tissue[toString(i)] <- means
# }
# #tmm_by_tissue <- tmm_by_tissue %>% gather(tissue_name, tmm, -1)
# #write.csv(tmm_by_tissue, file="tmm_tissue_name.csv", row.names = FALSE)
# 
# unique_region <- select(metadata,region_tissue_name) %>%
#   distinct()
# tmm_by_region <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_region$region_tissue_name) {
#   current_region_meta = metadata[metadata$region_tissue_name == i,]
#   current_tmm = select(tmm_by_tissue, c(target_id, unique(current_region_meta$tissue_name)))
#   max <- select(current_tmm,-target_id) %>% apply(1,max)
#   tmm_by_region[toString(i)] <- max
# }
# 
# # write.csv(tmm_by_region, file="tmm_region_name.csv", row.names = FALSE)
# 
# unique_consensus <- select(metadata,consensus_tissue_name) %>%
#   distinct()
# tmm_by_consensus <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_consensus$consensus_tissue_name) {
#   current_consensus_meta = metadata[metadata$consensus_tissue_name == i,]
#   current_tmm = select(tmm_by_region, c(target_id, unique(current_consensus_meta$region_tissue_name)))
#   max <- select(current_tmm,-target_id) %>% apply(1,max)
#   tm_by_consensus[toString(i)] <- max
# }

#write.csv(tmm_by_consensus, file="tmm_consensus_name.csv", row.names = FALSE)

#Figure 1 ##Figure 1A - Hierarchy Overview

tissue_colors_palette_full <- rbind(
  metadata %>% select(name = tissue_name, color = tissue_color), 
  metadata %>% select(name = consensus_tissue_name, color = consensus_tissue_color),
  metadata %>% select(name = organ_name, color = organ_color)
) %>% distinct() %>% 
  mutate(name = str_to_sentence(name)) %>% arrange(name)

pal <- tissue_colors_palette_full$color
pal <- set_names(pal,tissue_colors_palette_full$name )


plot_data1 <- 
  metadata %>%
  select(tissue_name, region_tissue_name, consensus_tissue_name, organ_name) %>% #,
         #tissue_color, region_tissue_color, consensus_tissue_color, organ_color) %>% 
  mutate(tissue_name = str_to_sentence(tissue_name), region_tissue_name = str_to_sentence(region_tissue_name), consensus_tissue_name = str_to_sentence(consensus_tissue_name), organ_name = str_to_sentence( organ_name)) %>%   unique() %>% 
  mutate(organ_name = factor(case_when(organ_name == "Male reproductive system" ~ 
                                         "Male tissues",
                                       organ_name == "Breast and female reproductive system" ~
                                         "Female tissues",
                                       organ_name == "Adipose & soft tissue" ~ 
                                         "Connective & soft tissue",
                                       organ_name == "Bone marrow & immune system" ~
                                         "Bone marrow & lymphoid tissues",
                                       T ~ organ_name),
                             c("Brain",
                               "Eye",
                               "Endocrine tissues",
                               "Respiratory system",
                               "Proximal digestive tract",
                               "Gastrointestinal tract",
                               "Liver & gallbladder",
                               "Kidney & urinary bladder",
                               "Pancreas",
                               "Male tissues",
                               "Female tissues",
                               "Muscle tissues",
                               "Connective & soft tissue",
                               "Skin",
                               "Bone marrow & lymphoid tissues")))

plot_data1 <- plot_data1 %>% 
  arrange(organ_name,
          consensus_tissue_name,
          region_tissue_name,
          tissue_name) %>% 
  mutate(plot_order = row_number())

plot_data2 <- 
  plot_data1 %>%
  select(-region_tissue_name) %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("organ_name", 
                           "consensus_tissue_name", 
                           "tissue_name")))
Warning: attributes are not identical across measure variables;
they will be dropped`summarise()` has grouped output by 'label'. You can override using the `.groups` argument.
plot_data3 <- 
  plot_data1 %>% 
  group_by(consensus_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(tissue_colors_palette_full,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)
`summarise()` has grouped output by 'column'. You can override using the `.groups` argument.
ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 3, xmax = 4, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "consensus_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 1, xmax = 2, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "consensus_tissue_name",
      xend = "tissue_name",
      y = left_pos,
      yend = plot_order,
      color = tissue_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "consensus_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  geom_label(
    data = plot_data2 %>%
      filter(column == "organ_name"),
    #aes(x = column, y = plot_order, label = label),
    aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
    show.legend = F,
    label.size = 0,
    hjust = 1,
    lineheight = 0.7,
    label.padding = unit(0, "mm"),
    size = 2 * 5 / 6
  ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c('Organ system', "Grouped tissue", "Tissue type"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
Warning: Ignoring unknown parameters: `width`Warning: Ignoring unknown parameters: `label.padding`Warning: Ignoring unknown parameters: `label.padding`
ggsave("final_plots/tissues.pdf", height = 7, width = 4)

##Figure 1B - Ward’s Retina-Dendrogramm


##Functino based on Max Karlsson's retinogramm
circular_dendrogram_retinastyle_2 <-
  function(clust, color_mapping, label_col, color_col, 
           scale_expansion = c(0.25, 0.25), text_size = 3, width_range = c(1.5, 6), 
           arc_strength = 0.8, default_color = "gray80") {
    require(ggraph)
    require(igraph)
    require(viridis)
    require(tidyverse)
    require(magrittr)
    
    dendrogram <-
      clust %>%
      as.dendrogram()
    
    
    
    g <-
      ggraph(dendrogram, layout = 'dendrogram', circular = T)
    # 
    # g +
    #   geom_edge_fan(data = edge_data %>%
    #                    mutate(hghl = edge.id == 99),
    #                  aes(label = edge_id, color = as.factor(rank_radius)),
    #                  width =4) +
    #   geom_node_text(aes(label = label))
    
    edge_data <- 
      get_edges()(g$data) %>%
      as_tibble() %>%
      left_join(color_mapping %>%
                  select(label = label_col,
                         color = color_col),
                by = c("node2.label" = "label")) %>%
      mutate(radius = xend^2 + yend^2,
             edge.id = as.character(edge.id)) %>%
      arrange(-radius) %>%
      mutate(edge_id = as.character(row_number()),
             rank_radius = unclass(factor(-radius)),
             x_m = round(x, 10),
             y_m = round(y, 10),
             xend_m = round(xend, 10),
             yend_m = round(yend, 10)) 
    
    
    edge_id_colors <- 
      edge_data %>% 
      filter(!is.na(color)) %$%
      set_names(color, edge_id)
    
    
    for(rank_rad in 2:max(edge_data$rank_radius)) {
      edge_id_colors_new <- 
        left_join(edge_data %>%
                    select(edge_id, radius, xend_m, yend_m, rank_radius) %>%
                    filter(rank_radius == rank_rad),
                  edge_data %>%
                    select(edge_id, radius, x_m, y_m, rank_radius) %>%
                    filter(rank_radius < rank_rad),
                  by = c("xend_m" = "x_m", "yend_m" = "y_m")) %>%
        left_join(enframe(edge_id_colors),
                  by = c("edge_id.y" = "name")) %>%
        group_by(edge_id.x) %>% 
        summarise(color = ifelse(n_distinct(value) == 1 & any(value != default_color), 
                                 as.character(unique(value)),
                                 default_color)) %$%
        set_names(color, edge_id.x)
      edge_id_colors <- 
        c(edge_id_colors, edge_id_colors_new)
    }
    
    
    
    g +
      scale_edge_width(range = width_range)+
      geom_edge_diagonal(data = edge_data,
                         aes(edge_color = as.character(edge_id),
                             edge_width = 1 - sqrt(xend^2 + yend^2)),
                         strength = arc_strength,
                         show.legend = F) +
      
      
      scale_edge_color_manual(values = edge_id_colors)  +
      g$data %>%
      filter(label != "") %>%
      mutate(degree = case_when(x >= 0 ~ asin(y) * 180 / pi,
                                x < 0 ~ 360 - asin(y) * 180 / pi)) %>%
      left_join(color_mapping %>%
                  select(label = label_col,
                         color = color_col),
                by = "label") %>%
                {geom_node_text(data = .,
                                aes(label = label),
                                angle = .$degree,
                                hjust = ifelse(.$x < 0, 
                                               1, 
                                               0),
                                vjust = 0.5,
                                size = text_size)}  +
      scale_x_continuous(expand = expand_scale(scale_expansion)) +
      scale_y_continuous(expand = expand_scale(scale_expansion)) +
      
      coord_fixed() +
      theme_void()
  }

hclust4RNAseq_ward <- function(df, correlation_method = "spearman"){
  #wide dataframe as input 
  #to get correlation between samples, where rows are genes columns are samples
  #to get correlation between genes across samples, input df with genes as columns
  #can use later for dendogram making: ggdendrogram([hclust4RNAseq_results], rotate = FALSE, size = 10, face = "bold")
  similarity <- cor(df, method=correlation_method, use="pairwise.complete.obs")
  dissimilarity <- 1 - similarity
  hcl <- hclust(as.dist(dissimilarity), "ward.D2")
  return (hcl)
} 

tissue_dendro_ward <- hclust4RNAseq_ward(tmm_tissue %>% mutate(tissue_name = str_to_sentence(tissue_name)) %>%  spread(tissue_name, tmm) %>% column_to_rownames("target_id"))

circular_dendrogram_retinastyle_2(
  clust = tissue_dendro_ward, 
  color_mapping = metadata %>% 
    select(tissue_name, tissue_color) %>% 
    mutate(tissue_name = str_to_sentence(tissue_name)), 
  label_col = "tissue_name", 
  color_col = "tissue_color", 
  scale_expansion = c(0.7, 0.7), 
  text_size = 2.4, 
  width_range = c(0.5, 4),
  arc_strength = 0.4, 
  default_color = "gray80")

ggsave("./final_plots/data_presentation/retinagram_all_tissue_clust_ward.pdf", width = 6, height = 6)

##Figure 1C - Spearman heatmap (grouped tissue)

##Spearman's roh heatmap at grouped tissue level

if(file.exists("./data/final_data/spearman_corr_consensus_tissues.csv")) {
  consensus_tmm_spearman <- read_csv("./data/final_data/spearman_corr_consensus_tissues.csv")
} else {
  consensus_tmm_spearman <-  tmm_consensus_tissue %>% 
    spread(consensus_tissue_name, tmm) %>% 
    column_to_rownames("target_id") %>% 
    cor(method="spearman", use="pairwise.complete.obs") %>% 
    as.data.frame() %>% 
    as_tibble(rownames = "consensus_tissue_name")
  write_csv(consensus_tmm_spearman,"./data/final_data/spearman_corr_consensus_tissues.csv")
}
Rows: 53 Columns: 54── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): consensus_tissue_name
dbl (53): adipose tissue, adrenal gland, aorta, bone marrow, brain, breast, cartilage, cervix, choroid plexus, circ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
consensus_tmm_spearman %>% 
  rename_with(str_to_sentence, -1) %>% 
  mutate(consensus_tissue_name = str_to_sentence(consensus_tissue_name)) %>% 
  column_to_rownames("consensus_tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()

ggsave("./final_plots/data_presentation/spearman_corr_consensus_tissue.pdf", height = 10, width = 10)

#Figure 2 ##Figure 2A - Sample level PCA Plot


sample_pca <-
  tmm_sample %>% 
  
  # gather(sample_name, tmm, -1) %>%
  # group_by(target_id) %>%
  # mutate(sd = sd(tmm)) %>%
  # ungroup() %>%
  # filter(sd > 0) %>%
  # select(-sd) %>%
  # spread(sample_name, tmm) %>%
  
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#sample_pca@scores

summary(sample_pca)
svd calculated PCA
Importance of component(s):
                 PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8
R2            0.3135 0.09126 0.08083 0.03926 0.03616 0.03489 0.02711 0.02105
Cumulative R2 0.3135 0.40474 0.48557 0.52483 0.56099 0.59588 0.62299 0.64404
plot_data <- sample_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


organ_colors <- metadata %>% select(organ_name, organ_color) %>% unique()
pal <-  organ_colors$organ_color
pal <- setNames(pal, organ_colors$organ_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = organ_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal) + 
  #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
  xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 3, byrow = TRUE)) 



#ggsave("./final_plots/data_presentation/sample_pca_1.pdf", width = 8, height = 8)
# ggsave("./final_plots/data_presentation/sample_pca_1_w_filter.pdf", width = 8, height = 8)
# 
# plot_data %>%
#   ggplot(aes(PC1, PC2)) +
#   geom_point(aes(fill = organ_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
#   scale_fill_manual(values = pal) + 
#   #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
#   xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
#   ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
#   theme_classic() + theme(legend.text = element_text(size = 10),
#                           legend.title =element_blank(), 
#                           legend.position = "bottom",
#                           legend.spacing.y = unit(-0.3, 'cm'),
#                           legend.spacing.x = unit(-0.01, 'cm')) +
#   guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 
# 
# #ggsave("./final_plots/data_presentation/sample_pca_2.pdf", width = 5, height = 5)
# # ggsave("./final_plots/data_presentation/sample_pca_2_w_filter.pdf", width = 7, height = 7)

###Extra PCA plots not in report

sample_pca <-
  tmm_sample %>% 
  # gather(sample_name, tmm, -1) %>% 
  # group_by(target_id) %>%
  # mutate(sd = sd(tmm)) %>%
  # ungroup() %>% 
  # filter(sd > 0) %>% 
  # select(-sd) %>% 
  # spread(sample_name, tmm) %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#sample_pca@scores

summary(sample_pca)
svd calculated PCA
Importance of component(s):
                 PC1     PC2     PC3     PC4     PC5     PC6     PC7     PC8
R2            0.3135 0.09126 0.08083 0.03926 0.03616 0.03489 0.02711 0.02105
Cumulative R2 0.3135 0.40474 0.48557 0.52483 0.56099 0.59588 0.62299 0.64404
plot_data <- sample_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color, organ_name) %>% unique() %>% filter(organ_name == "Brain") %>% select(-organ_name)
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = region_tissue_name),  color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal, na.value = "#FFFFFF") + 
  xlim(-75,0) +
  ylim(-10,5) +
  xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/brain_w_all_sample_pca.pdf", width = 5, height = 5)

NA
NA

brain_pca <-
  tmm_sample %>% select(c(target_id, metadata %>% filter(organ_name =="Brain") %>% .$ID))%>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#brain_pca@scores


plot_data <- brain_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color) %>% unique()
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = region_tissue_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal) + 
  #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
  xlab(paste("PC1", brain_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", brain_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/brain_sample_pca.pdf", width = 5, height = 5)

NA
NA
NA

##Figure 2B - UMAP Plot

wide_data = tmm_sample 
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))

organ_colors <- metadata %>% select(organ_name, organ_color) %>% unique()
pal <-  organ_colors$organ_color
pal <- setNames(pal, organ_colors$organ_name)

# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = organ_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + scale_fill_manual(values = pal) + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_umap_euclidean.pdf", width = 5, height = 5.5)

##Figure 2C - Brain Umap Plot

#Plot focusing only on brain samples, but UMAP was based on the whole sample set, not only brian samples.

wide_data = tmm_sample 
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color, organ_name) %>% unique() %>% filter(organ_name == "Brain") %>% select(-organ_name)
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)
# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = region_tissue_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + 
  scale_fill_manual(values = pal, na.value = "#FFFFFF") + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  xlim(-11,-6) +
  ylim(-8,-2) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_focus_brain_umap_euclidean.pdf", width = 5, height = 5.5)

###Extra UMAP plot not in report

#UMAP plot based only on brain samples, thus different than the plot above.

wide_data = tmm_sample %>% select(c(target_id, metadata %>% filter(organ_name == "Brain") %>% .$ID))
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))

region_tissue_colors <- metadata %>% select(region_tissue_name, region_tissue_color) %>% unique()
pal <-  region_tissue_colors$region_tissue_color
pal <- setNames(pal, region_tissue_colors$region_tissue_name)

# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = region_tissue_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + scale_fill_manual(values = pal) + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                         # legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 1, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_brain_umap_euclidean_l.pdf", width = 5.5, height = 3)

##Figure 2D - grouped sample to sample correlation plots

if(file.exists("./data/final_data/spearman_sample.csv")) {
  sample_tmm_spearman <- read_csv("./data/final_data/spearman_sample.csv")
} else {
  sample_tmm_spearman <-  tmm_sample %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "sample_name")
  write_csv(as.data.frame(sample_tmm_spearman) %>% as_tibble(),"./data/final_data/spearman_sample.csv")
}
Rows: 352 Columns: 353── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): sample_name
dbl (352): ventral.forebrain_male.3, ventral.forebrain_male.2, ventral.forebrain_female.3, ventral.forebrain_female...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
correlation_to_different_organs <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_organ <- metadata %>% filter(ID == sample) %>% .$organ_name %>% unique()
  different_organ_samples <- metadata %>% filter(organ_name != sample_organ) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% different_organ_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_different_organs <- rbind(correlation_to_different_organs, sample_correlation) 
}

correlation_to_same_organ <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_organ <- metadata %>% filter(ID == sample) %>% .$organ_name %>% unique()
  same_organ_samples <- metadata %>% filter(organ_name == sample_organ) %>% filter(ID != sample) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_organ_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_same_organ <- rbind(correlation_to_same_organ, sample_correlation) 
}

correlation_to_same_tissue <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_tissue <- metadata %>% filter(ID == sample) %>% .$tissue_name %>% unique()
  same_tissue_samples <- metadata %>% filter(tissue_name == sample_tissue) %>% filter(ID != sample) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_tissue_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_same_tissue <- rbind(correlation_to_same_tissue, sample_correlation) 
}

correlation_to_same_tissue_by_tissue <- tibble(sample_name = c(), correlation = c(), tissue_name = c())
for (sample in metadata$ID){
  sample_tissue <- metadata %>% filter(ID == sample) %>% .$tissue_name %>% unique()
  same_tissue_samples <- metadata %>% filter(tissue_name == sample_tissue) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_tissue_samples) %>% select("sample_name", sample) %>% filter(sample_name != sample) %>%  left_join(metadata %>% select(ID, tissue_name), by= c("sample_name" = "ID")) %>% rename("correlation" = sample)
  correlation_to_same_tissue_by_tissue <- rbind(correlation_to_same_tissue_by_tissue, sample_correlation) 
}

correlation_to_same_tissue_by_tissue <- correlation_to_same_tissue_by_tissue %>% 
  mutate(tissue_name = str_to_sentence(tissue_name)) %>% 
  group_by(tissue_name) %>% 
  mutate(min = min(correlation)) %>% 
  ungroup() %>% 
  arrange(min)

p1 <- correlation_to_different_organs %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100)  + xlim(0.5,1)+
  theme_classic() + 
  theme(panel.background = element_rect("gray90"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title.y.left = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) + 
  ggtitle("Different organs")

p2 <- correlation_to_same_organ %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100) + xlim(0.5,1) +
  theme_classic() + 
  theme(panel.background = element_rect("gray90"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Same organs")

p3 <- correlation_to_same_tissue %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100) + xlim(0.5,1) +
  theme_classic() +
  theme(panel.background = element_rect("gray90"),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()
        ) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Same tissue")

p4 <- correlation_to_same_tissue_by_tissue %>% 
  mutate(tissue_name = factor(tissue_name, correlation_to_same_tissue_by_tissue$tissue_name %>% 
                                unique())) %>% 
  ggplot(aes(x = correlation, y = tissue_name)) +
  geom_boxplot(outlier.size=0.3) + xlim(0.5,1) +
  theme_bw() + 
  xlab ("Spearman's roh") +
  theme(axis.title.y = element_blank())

p1 / p2 / p3 / p4 + plot_layout(heights = c(1, 1, 1, 15))

ggsave("./final_plots/data_presentation/spearman_corr_sample_vs_tissue_organ.pdf", height = 14, width = 4)

#Specificity and distribution annotation ##Formulas

#calculate_tau_score and hpa_gene_classification taken from Max Karlsson
calculate_tau_score <- 
  function(wide_data) {
    max_exp <- 
      apply(wide_data,
            MARGIN = 1,
            function(x) max(x, na.rm = T))
    
    N <- 
      apply(wide_data,
            MARGIN = 1,
            function(x) length(which(!is.na(x))))
    
    expression_sum <- 
      wide_data %>% 
      sweep(MARGIN = 1, 
            STATS = max_exp, 
            FUN = `/`) %>% 
      {1 - .} %>% 
      apply(MARGIN = 1,
            function(x) sum(x, na.rm = T))
    
    
    tau_score <- 
      (expression_sum / (N - 1)) %>% 
      enframe("gene", "tau_score")
    
    tau_score
  }

hpa_gene_classification <- 
  #feed in long data
  function(data, expression_col, tissue_col, gene_col, enr_fold, max_group_n, det_lim = 1) {
    data_ <- 
      data %>% 
      select(gene = gene_col,
             expression = expression_col,
             tissue = tissue_col) %>% 
      mutate(expression = round(expression, 4)) 
    
    if(any(is.na(data_$expression))) stop("NAs in expression column")
    if(any(is.na(data_$gene))) stop("NAs in gene column")
    if(any(is.na(data_$tissue))) stop("NAs in tissue column")
    
    n_groups <- length(unique(data_$tissue))
    
    gene_class_info <- 
      data_ %>%
      group_by(gene) %>%
      summarise(
        
        # Gene expression distribution metrics
        mean_exp = mean(expression, na.rm = T),
        min_exp = min(expression, na.rm = T),
        max_exp = max(expression, na.rm = T), 
        max_2nd = sort(expression)[length(expression)-1],
        
        # Expression frequency metrics
        n_exp = length(which(expression >= det_lim)),
        frac_exp = n_exp/length(expression[!is.na(expression)])*100,
        
        # Limit of enhancement metrics
        lim = max_exp/enr_fold, 
        
        exps_over_lim = list(expression[which(expression >= lim & expression >= det_lim)]),
        n_over = length(exps_over_lim[[1]]), 
        mean_over = mean(exps_over_lim[[1]]),
        min_over = ifelse(n_over == 0, NA,
                          min(exps_over_lim[[1]])),
        
        max_under_lim = max(expression[which(expression < min_over)], det_lim*0.1),
        
        
        exps_enhanced = list(which(expression/mean_exp >= enr_fold & expression >= det_lim)),
        
        
        
        
        # Expression patterns
        enrichment_group = paste(sort(tissue[which(expression >= lim & expression >= det_lim)]), collapse=";"),
        
        n_enriched = length(tissue[which(expression >= lim & expression >= det_lim)]),
        n_enhanced = length(exps_enhanced[[1]]), 
        enhanced_in = paste(sort(tissue[exps_enhanced[[1]]]), collapse=";"),
        n_na = n_groups - length(expression),
        max_2nd_or_lim = max(max_2nd, det_lim*0.1),
        tissues_not_detected = paste(sort(tissue[which(expression < det_lim)]), collapse=";"),
        tissues_detected = paste(sort(tissue[which(expression >= det_lim)]), collapse=";")) 
    
    
    gene_categories <- 
      gene_class_info %>%
      
      mutate(
        spec_category = case_when(n_exp == 0 ~ "not detected", 
                                  
                                  # Genes with expression fold times more than anything else are tissue enriched
                                  max_exp/max_2nd_or_lim >= enr_fold ~ "tissue enriched", 
                                  
                                  # Genes with expression fold times more than other tissues in groups of max group_n - 1 are group enriched
                                  max_exp >= lim &
                                    n_over <= max_group_n & n_over > 1 &
                                    mean_over/max_under_lim >= enr_fold ~ "group enriched", 
                                  
                                  # Genes with expression in tissues fold times more than the mean are tissue enhance
                                  n_enhanced > 0 ~ "tissue enhanced", 
                                  
                                  # Genes expressed with low tissue specificity
                                  T ~ "low tissue specificity"), 
        
        
        dist_category = case_when(frac_exp == 100 ~ "detected in all",
                                  frac_exp >= 31 ~ "detected in many",
                                  n_exp > 1 ~ "detected in some",
                                  n_exp == 1 ~ "detected in single",
                                  n_exp == 0 ~ "not detected"),
        
        spec_score = case_when(spec_category == "tissue enriched" ~ max_exp/max_2nd_or_lim,
                               spec_category == "group enriched" ~ mean_over/max_under_lim, 
                               spec_category == "tissue enhanced" ~ max_exp/mean_exp)) 
    
    
    
    
    ##### Rename and format
    gene_categories %>%
      mutate(enriched_tissues = case_when(spec_category %in% c("tissue enriched", "group enriched") ~ enrichment_group,
                                          spec_category == "tissue enhanced" ~ enhanced_in),
             n_enriched = case_when(spec_category %in% c("tissue enriched", "group enriched") ~ n_enriched,
                                    spec_category == "tissue enhanced" ~ n_enhanced)) %>%
      select(gene, 
             spec_category, 
             dist_category, 
             spec_score,
             n_expressed = n_exp, 
             fraction_expressed = frac_exp,
             max_exp = max_exp,
             enriched_tissues,
             n_enriched,
             n_na = n_na,
             tissues_not_detected,
             tissues_detected) 
    
  } 

##Annotation


# Specificity classification at consensus level
classification_consensus <- hpa_gene_classification(data = tmm_consensus_tissue, expression_col = "tmm", tissue_col = "consensus_tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

consensus_tau <- calculate_tau_score(tmm_consensus_tissue  %>% 
                                     spread(consensus_tissue_name, tmm)%>%  
                                       mutate_if(is.numeric, function(x){log10(x+1)})%>% 
                                       column_to_rownames("target_id")) 
#category not detected has a very noisy tau, so no tau score for those
classification_consensus <- classification_consensus %>% 
  left_join(consensus_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

#at region level
classification_region <- hpa_gene_classification(data = tmm_region_tissue, expression_col = "tmm", tissue_col = "region_tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

region_tau <- calculate_tau_score(tmm_region_tissue  %>% 
                                    spread(region_tissue_name, tmm) %>%  
                                    mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
                                    column_to_rownames("target_id") )

classification_region <- classification_region %>% 
  left_join(region_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

#at tissue level
classification_tissue <- hpa_gene_classification(data = tmm_tissue, expression_col = "tmm", tissue_col = "tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

tissue_tau <- calculate_tau_score(tmm_tissue %>% 
                                    spread(tissue_name, tmm) %>%  
                                    mutate_if(is.numeric, function(x){log10(x+1)})%>%
                                    column_to_rownames("target_id") )

classification_tissue <- classification_tissue %>% 
  left_join(tissue_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

#Figure 3 ##Figure 3A - Pie charts at tissue type level

gene_category_pal <- c("Detected in all" = "#253494",
                       "Detected in many" = "#2c7fb8",
                       "Detected in some" = "#41b6c4",
                       "Detected in single" = "#a1dab4",
                       "Not detected " = "  #bebebe",
                       "Low tissue specificity" = "grey40",
                       "Tissue enhanced" = "#984ea3",
                       "Group enriched" = "#FF9D00",
                       "Tissue enriched" = "#e41a1c")

plot_data <-
  classification_tissue %>% 
  rename(specificity = spec_category, distribution = dist_category) %>% 
  select(gene, specificity, distribution) %>% 
  gather(class_type, class, -1) %>% 
  group_by(class_type, class) %>% 
  count() %>%
  group_by(class_type) %>% 
  mutate(perc = 100 * n / sum(n),
         label = paste0(n, "\n", round(perc, 1), "%")) %>% 
  ungroup() %>% 
  mutate(class = factor(str_to_sentence(class), str_to_sentence(c("tissue enriched", "group enriched",  "tissue enhanced", "low tissue specificity", "detected in all", "detected in many", "detected in some", "detected in single", "not detected"))),
         class_type = factor(str_to_sentence(class_type),
                             c("Specificity", "Distribution")))
    
plot_dodge = 0.1
plot_data %>% 
  arrange(match(class, rev(levels(class)))) %>% 
  group_by(class_type) %>% 
  mutate(y_stack = cumsum(n) - n/2) %>% 
  {ggplot(., aes(1, n, fill = class, group = class, 
                 label = label)) +
      geom_col(show.legend = F, 
               color = "white", 
               width = 1) + 
      geom_segment(aes(x = 1.5 + plot_dodge, xend = 1.5, 
                       y = y_stack, yend = y_stack), size = 0.5) +
      geom_text_repel(aes(x = 1.5 + plot_dodge, y = y_stack), 
                      color = "black", nudge_x = plot_dodge, 
                      segment.size = 0.5, size = 24/11) +
      scale_fill_manual(values = gene_category_pal) + 
      facet_wrap(~class_type) +
      coord_polar("y",start = 0) +
      theme_void() + 
      scale_x_continuous(expand = expansion(c(0,0.8)))}


ggsave("./final_plots/classification/class_pies_tissue_type.pdf",width = 6, height = 5)

##Figure 3B - Distibution for each tissue type


classification_tissue %>% group_by(dist_category) %>% count()

ordered_names_distr <- classification_tissue %>% 
  filter(dist_category %in% c("detected in single", "detected in some", "detected in many", "detected in all")) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected %>% 
  str_to_sentence()

detection_palette <- c("Detected in all" = "#253494",
                        "Detected in many" = "#2c7fb8",
                        "Detected in some" = "#41b6c4",
                        "Detected in single" = "#a1dab4",
                        "Not detected " = "grey")


classification_tissue %>%
  filter(
    dist_category %in% c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(str_to_sentence( tissues_detected), ordered_names_distr)) %>%
  mutate(dist_category = factor(
    str_to_sentence( dist_category),
    c(
      "Detected in single",
      "Detected in some",
      "Detected in many",
      "Detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Number of detected genes per tissue type")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

 ggsave("./final_plots/classification/tissue_detection_a.pdf", height = 11.5, width = 4.5)

NA
NA

##Figure 3C - Distribution and Tau score

detection_palette <- c("Detected in all" = "#253494",
                        "Detected in many" = "#2c7fb8",
                        "Detected in some" = "#41b6c4",
                        "Detected in single" = "#a1dab4",
                        "Not detected " = "grey")

p1 <- classification_tissue %>% 
  mutate(dist_category = factor(str_to_sentence( dist_category), levels = rev(names(detection_palette))),
         enriched_tissues = str_to_sentence(enriched_tissues)) %>% 
  filter(dist_category != "Not detected") %>% 
  ggplot(aes(x = tau_score, y = dist_category, fill = dist_category)) +
  geom_violin() +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  xlab("Tau score") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank()
    ) 

#ggsave("./final_plots/classification/tau_to_distribution.pdf",width = 5.5, height = 4)


p2 <- classification_tissue %>%
  ggplot(aes(tau_score)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  ylab("Count")+
  theme(panel.background = element_rect("gray90"),
        #axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0))

p2/ p1 + plot_layout(heights = c(1, 3))

ggsave("./final_plots/classification/distribution_and_dendrogram_tissue_w_overall.pdf",width = 6.5, height = 6)

###Extra Figures not in report

detection_palette <- c("detected in all" = "#253494",
                        "detected in many" = "#2c7fb8",
                        "detected in some" = "#41b6c4",
                        "detected in single" = "#a1dab4",
                        "not detected " = "grey")

ordered_names_distr <- classification_tissue %>%  
  filter(dist_category %in% c("detected in single"#,
                             # "detected in some"
                              )) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected

classification_tissue %>%  
  filter(
    dist_category %in% c(
      "detected in single"#,
#
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(tissues_detected, ordered_names_distr)) %>%
  mutate(dist_category = factor(
    dist_category,
    c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
#  ggtitle("Number of detected genes per tissue type")  +
  ggtitle("Number of detected genes detected in a single tissue")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

detection_palette <- c("detected in all" = "#253494",
                        "detected in many" = "#2c7fb8",
                        "detected in some" = "#41b6c4",
                        "detected in single" = "#a1dab4",
                        "not detected " = "grey")

ordered_names_distr <- classification_tissue %>%  
  filter(dist_category %in% c("detected in single",
                              "detected in some"
                              )) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected

classification_tissue %>%  
  filter(
    dist_category %in% c(
      "detected in single",
      "detected in some"
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(tissues_detected, ordered_names_distr)) %>%
  mutate(dist_category = factor(
    dist_category,
    c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Number of detected genes per tissue type")  +
 # ggtitle("Number of detected genes detected in a single tissue")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

#Figure 4 ##Figure 4A - Pie charts at grouped tissue level

gene_category_pal <- c("Detected in all" = "#253494",
                       "Detected in many" = "#2c7fb8",
                       "Detected in some" = "#41b6c4",
                       "Detected in single" = "#a1dab4",
                       "Not detected " = "  #bebebe",
                       "Low tissue specificity" = "grey40",
                       "Tissue enhanced" = "#984ea3",
                       "Group enriched" = "#FF9D00",
                       "Tissue enriched" = "#e41a1c")

plot_data <-
  classification_consensus %>% 
  rename(specificity = spec_category, distribution = dist_category) %>% 
  select(gene, specificity, distribution) %>% 
  gather(class_type, class, -1) %>% 
  group_by(class_type, class) %>% 
  count() %>%
  group_by(class_type) %>% 
  mutate(perc = 100 * n / sum(n),
         label = paste0(n, "\n", round(perc, 1), "%")) %>% 
  ungroup() %>% 
  mutate(class = factor(str_to_sentence(class), str_to_sentence(c("tissue enriched", "group enriched",  "tissue enhanced", "low tissue specificity", "detected in all", "detected in many", "detected in some", "detected in single", "not detected"))),
         class_type = factor(str_to_sentence(class_type),
                             c("Specificity", "Distribution")))
    
plot_dodge = 0.1
plot_data %>% 
  arrange(match(class, rev(levels(class)))) %>% 
  group_by(class_type) %>% 
  mutate(y_stack = cumsum(n) - n/2) %>% 
  {ggplot(., aes(1, n, fill = class, group = class, 
                 label = label)) +
      geom_col(show.legend = F, 
               color = "white", 
               width = 1) + 
      geom_segment(aes(x = 1.5 + plot_dodge, xend = 1.5, 
                       y = y_stack, yend = y_stack), size = 0.5) +
      geom_text_repel(aes(x = 1.5 + plot_dodge, y = y_stack), 
                      color = "black", nudge_x = plot_dodge, 
                      segment.size = 0.5, size = 24/11) +
      scale_fill_manual(values = gene_category_pal) + 
      facet_wrap(~class_type) +
      coord_polar("y",start = 0) +
      theme_void() + 
      scale_x_continuous(expand = expansion(c(0,0.8)))}


ggsave("./final_plots/classification/class_pies_grouped_tissue.pdf",width = 6, height = 5)

##Figure 4B - Speceficity for each grouped tissue

classification_consensus %>% group_by(spec_category) %>% count() 

ordered_names_sp <-
  classification_consensus %>%
  filter(spec_category %in% c("group enriched", "tissue enriched", "tissue enhanced")) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  group_by(spec_category, enriched_tissues) %>%
  count() %>%
  ungroup() %>% 
  group_by(enriched_tissues) %>%
  summarise(sum = sum(n)) %>%
  ungroup() %>% 
  arrange(sum) %>%
  .$enriched_tissues %>% 
  str_to_sentence()
  
  
  

specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

classification_consensus %>%
  filter(spec_category %in% c("group enriched", "tissue enriched", "tissue enhanced")) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  group_by(spec_category, enriched_tissues) %>%
  count() %>%
  mutate(enriched_tissues = factor(str_to_sentence(enriched_tissues), ordered_names_sp)) %>%
  mutate(spec_category = factor(str_to_sentence(spec_category), c("Tissue enriched", "Group enriched",  "Tissue enhanced"))) %>%
  ggplot(aes(x = n, y = enriched_tissues, fill = spec_category)) +
  geom_col() +
  scale_fill_manual(values = specificity_palette) +
  xlab("Number of genes")+
  theme_classic() + theme(
    axis.title.y = element_blank(),
  #  axis.title.x = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Specificity category per consensus tissue") 

ggsave("./final_plots/classification/consensus_tissue_specificity.pdf", height = 7, width = 5.5)

##Figure 4C - Specificity and Tau score



specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

p1 <- classification_consensus %>% 
  mutate(spec_category = factor(str_to_sentence( spec_category), levels = names(specificity_palette)),
         enriched_tissues = str_to_sentence(enriched_tissues)) %>% 
  filter(spec_category != "Not detected") %>% 
  ggplot(aes(x = tau_score, y = spec_category, fill = spec_category)) +
  geom_violin() +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  xlab("Tau score") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank()
    ) 

#ggsave("./final_plots/classification/tau_to_specificity.pdf",width = 5.5, height = 4)


p2 <- classification_consensus %>%
  ggplot(aes(tau_score)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  ylab("Count")+
  theme(panel.background = element_rect("gray90"),
        #axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0))

p2/ p1 + plot_layout(heights = c(1, 3))

ggsave("./final_plots/classification/specificity_and_dendrogram_consensus_w_overall.pdf",width = 7, height = 6)

##Figure 4D - Gene annotation alluvial plot

width = 0.1

alluv_1 <-
  
  classification_consensus %>%
  mutate(Specificity = str_to_sentence(spec_category),
          Distribution = str_to_sentence(dist_category)) %>%
  select(Specificity,
         Distribution) %>%
  mutate(row_n = row_number()) %>%
  gather(bar, chunk, -row_n) %>%
  mutate(color_vars = 1) %>%
  group_by(row_n) %>%
  mutate(chunk_color = chunk[match(c("Specificity",
                                     "Distribution")[color_vars], bar)]) %>% 
  ungroup() %>%
  
  
  mutate(chunk = factor(chunk, levels = c('Tissue enriched', 'Group enriched', 
                                          'Tissue enhanced', 'Low tissue specificity', 
                                          'Detected in single',
                                          'Detected in some', 
                                          'Detected in many', 
                                          'Detected in all',
                                          'Not detected')),
         bar = factor(bar, levels = c("Specificity",
                                      "Distribution"))) %>%
  
  
  ggplot(aes(x = bar, stratum = chunk, alluvium = row_n,
             y = 1)) +
  
  geom_flow(aes(fill = chunk_color), 
            show.legend = F, width = width,
            knot.pos = 1/6) +
  geom_stratum(aes(fill = chunk), 
               show.legend = F, color = NA, width = width) +
  
  scale_x_discrete(expand = c(.1, .1), position = "top") +
  scale_fill_manual(values = c(gene_category_pal)) + 
  
  
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        axis.text.x = element_blank(), 
        axis.ticks = element_blank(), 
        panel.background = element_blank(), 
        axis.title = element_blank()) +
  coord_flip()

# alluv_1

flow_data <-
  ggplot_build(alluv_1)$data[[1]] %>%
  as_tibble() %>%
  {
    if ("side" %in% names(.)) {
      .
    } else{
      mutate(.,
             side = case_when(flow == "from" ~ "start",
                              flow == "to" ~ "end"))
    }
  }



stratum_data <- 
  ggplot_build(alluv_1)$data[[2]]

flow_data_labels <-
  flow_data %>%
  as_tibble() %>%
  select(x, stratum, group, side, ymin, ymax) %>%
  pivot_wider(names_from = side,
              values_from = c(x, stratum, ymin, ymax)) %>%
  mutate_at(
    c(
      "x_end",
      "ymax_end",
      "ymin_end",
      "x_start",
      "ymax_start",
      "ymin_start"
    ),
    as.numeric
  ) %>%
  group_by(stratum_start, stratum_end, x_start, x_end) %>%
  summarise(
    y_end = (min(ymin_end) + max(ymax_end)) / 2,
    y_start = (min(ymin_start) + max(ymax_start)) / 2,
    size = max(ymax_start) - min(ymin_start)
  )
`summarise()` has grouped output by 'stratum_start', 'stratum_end', 'x_start'. You can override using the `.groups` argument.
alluv_2 <- 
  alluv_1 +
  geom_text(data = flow_data_labels,
            aes(x = x_start + width/2,
                y = y_start, 
                label = size), 
            inherit.aes = F, 
            size = 3,
            angle = -90,
            hjust = 1,
            vjust = 0.5) +
  geom_text(data = flow_data_labels,
            aes(x = x_end - width/2,
                y = y_end, 
                label = size), 
            inherit.aes = F, 
            size = 3, 
            angle = -90,
            hjust = 0,
            vjust = 0.5) +
  
  # Stratum label
  
  geom_text(data = stratum_data %>%
              filter(x == 1),
            aes(x = x - width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = 1.5,
            inherit.aes = F) + 
  geom_text(data = stratum_data %>%
              filter(x == 2),
            aes(x = x + width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = -0.5,
            inherit.aes = F) + 
  
  geom_text(data = stratum_data,
            aes(x = x, 
                y = y,
                label = ymax - ymin), 
            size = 4, 
            fontface = "bold",
            color = "white",
            inherit.aes = F)


alluv_2
ggsave("./final_plots/classification/alluvial_classification.pdf",width = 8, height = 3)

###Extra figures not in report

organ_colors <- metadata %>% select(consensus_tissue_name, consensus_tissue_color) %>% unique()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, str_to_sentence(organ_colors$consensus_tissue_name))


specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))


class_table_temp <-
  classification_consensus %>%
  select(gene, spec_category, enriched_tissues) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  mutate(spec_category = factor(str_to_sentence( spec_category), levels = names(specificity_palette)),
         enriched_tissues = str_to_sentence(enriched_tissues))

plot_dendro <-
  tmm_consensus_tissue %>% 
  select(target_id, consensus_tissue_name, tmm) %>% 
  mutate(consensus_tissue_name = str_to_sentence(consensus_tissue_name)) %>%
  spread(target_id, tmm) %>%
  column_to_rownames("consensus_tissue_name")  %>%
  t() %>%
  cor(method = "spearman") %>%
  
  {1 - .} %>%
  as.dist() %>% 
  hclust(method = "average") %T>%
  plot %>%
  dendro_data()

  
  
dendro_plot_data <- 
  left_join(plot_dendro$segments, 
            plot_dendro$labels, 
            by = c("x" = "x", "yend" = "y")) 

left_plot <- 
  dendro_plot_data %>%
  ggplot() +
  geom_segment(aes(x=y, y=x, xend=yend, yend=xend, group = label))+
  geom_rect(aes(xmin=0, ymin=x + 0.5, 
                xmax=-0.02, ymax=xend - 0.5, 
                fill = label), 
            show.legend = F) +
  scale_color_manual(values = pal)+
  scale_fill_manual(values = pal)+
  scale_x_reverse(expand = expansion(0), position = "top")+
  scale_y_continuous(expand = expansion()) +
  xlab("1 - Spearman's rho") +
  
  theme(axis.text.y = element_blank(), 
        axis.title.y = element_blank(), 
        axis.ticks.y = element_blank(),
        plot.margin = unit(c(1,1,1,1), units = "mm"), 
        panel.background = element_blank()) 

right_plot <- 
  class_table_temp %>%
  filter(!is.na(enriched_tissues)) %>%
  group_by(enriched_tissues, spec_category) %>%
  summarise(n_genes = n()) %>%
  ungroup() %>%
  mutate(enriched_tissues = factor(enriched_tissues, levels = plot_dendro$labels$label),
         spec_category = factor(spec_category, names(specificity_palette))) %>%
  ggplot(aes(n_genes, enriched_tissues, fill = spec_category)) +
  geom_col(width = 0.8, size = 0.1) +
  theme_bw() + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  # coord_flip() +
  xlab("Number of genes") +
  
  scale_x_continuous(position = "top", expand = expansion(c(0, 0.1))) + 
  scale_y_discrete(expand = expansion()) +
  
  theme(axis.text.y = element_text(hjust = 0.5), 
        legend.position = c(0.7, 0.5),
        axis.title.y = element_blank(),
        panel.border = element_blank())
`summarise()` has grouped output by 'enriched_tissues'. You can override using the `.groups` argument.
left_plot + right_plot +
  plot_layout(widths = c(0.3, 1))


ggsave("./final_plots/classification/specificity_and_dendrogram_consensus.pdf",width = 7, height = 7)

#Figure 5

##Plot based on Max Karlsson's network plot

classification_network_plot_2 <- 
  function(class_table, gene_col, spec_col, enriched_col, spec_filter, pal, savename, enriched_sep = ";", 
           node_filter_rank = 2, node_filter_show_cat = c("tissue enriched"), 
           node_filter_min = 2, node_filter_n_show = 8, scale_factor = 1) {
    
    enrichment_table <- 
      class_table %>% 
      select(gene = gene_col, 
             spec = spec_col, 
             enriched = enriched_col) %>% 
      filter(spec %in% spec_filter) %>% 
      group_by(enriched, spec) %>% 
      summarise(n_genes = n()) %>%
      ungroup()
    
    net_data <-
      enrichment_table %>% 
      mutate(all_enriched = enriched) %>%
      separate_rows(enriched, sep = enriched_sep) %>%
      group_by(enriched, spec) %>% 
      mutate(rank = rank(-n_genes, ties.method = "min")) %>%
      group_by(all_enriched) %>%
      mutate(any_low_rank = any(rank <= node_filter_rank)) %>%
      ungroup() %>%
      
      filter((spec == node_filter_show_cat | any_low_rank | n_genes >= node_filter_n_show) &
               (spec == node_filter_show_cat | n_genes >= node_filter_min)) %>% 
      mutate(edge_id = paste("enriched:", all_enriched)) %>% 
      arrange(n_genes)
    
    net_edges <- 
      net_data %$% 
      tibble(node1 = enriched, node2 = edge_id, n = n_genes) %>% 
      unique()
    
    g <-
      net_edges %>%
      graph_from_data_frame(directed = FALSE) %>%
      ggraph(layout = "kk") 
    
    link_map <- 
      net_edges %>% 
      gather(node, id, -(3)) %>% 
      mutate(tissue_node = node == "node1", 
             color_id = case_when(tissue_node ~ id, 
                                  grepl(";", id) ~ "Group enriched",
                                  !grepl(";", id) ~ "Tissue enriched"), 
             label = ifelse(tissue_node, color_id, n)) %>%
      select(n, node, id, tissue_node, color_id, label) %>%
      unique()
    
    
    edge_data <- get_edges()(g$data)
    node_data <- 
      get_nodes()(g$data) %>% 
      as_tibble() %>%
      left_join(link_map, 
                by = c("name" = "id")) 
    
    
    fig <- 
      g + 
      geom_edge_arc(aes(width = n), 
                    color = "gray", 
                    strength = 0, 
                    alpha = 0.5,
                    show.legend = F) + 
      scale_edge_alpha_continuous(range = c(0.3, 1)) +
      scale_edge_width_continuous(range = c(1, 3)) +
      
      geom_node_point(data = node_data  %>%
                        filter(!tissue_node),
                      aes(size = log10(n), 
                          fill = color_id), 
                      stroke = 1,
                      # size = 10,
                      shape = 21,
                      show.legend = F)+
      geom_node_point(data = node_data %>%
                        filter(tissue_node),
                      aes(fill = color_id), 
                      stroke = 1,
                      size = 20 * scale_factor,
                      shape = 21,
                      show.legend = F)+
      geom_node_text(data = node_data,
                     aes(label = label),
                     size = 4 * scale_factor) +
      scale_size_continuous(range = c(5, 10) * scale_factor) +
      scale_fill_manual(values = pal) +
      
      theme_void()
    
    ## ----- Save
    
    
    cyto_summary <- 
      net_edges %>% 
      mutate(category = ifelse(!grepl(enriched_sep, node2), "Tissue enriched", "Group enriched"), 
             node_id = unclass(factor(node2)),
             node1 = str_to_sentence(node1), 
             n_sqrt = sqrt(n), 
             str_len = str_length(node1)) %>%
      select(category, node1, node2, node_id, n, n_sqrt, str_len) 
    
    cyto_summary %>% 
      write_delim("cytoscape nodes summary.txt", delim = "\t")
  #    write_delim(savepath(paste(savename, "cytoscape nodes summary.txt")), delim = "\t")
    
    bind_rows(cyto_summary %>% 
                left_join(pal %>% 
                            enframe("node1", "color")) %>% 
                select(node_id = node1, 
                       color) %>% 
                unique() %>%
                mutate(node_type = "Tissue"),
              cyto_summary %>% 
                mutate(color = case_when(category == "Tissue enriched" ~ "#e41a1c",
                                         category == "Group enriched" ~ "#FF9D00"), 
                       node_id = as.character(node_id)) %>%
                select(node_id, color) %>% 
                unique() %>%
                
                mutate(node_type = "Enrichment")) %>%
      mutate(color2 = case_when(node_type == "Enrichment" ~ color,
                                node_type == "Tissue" ~ "#D3D3D3FF"),
             color3 = case_when(node_type == "Enrichment" ~ color,
                                node_type == "Tissue" ~ "#BEBEBEFF")) %>% 
      
      write_delim("cytoscape nodes color.txt", delim = "\t") 
      #write_delim(savepath(paste(savename, "cytoscape nodes color.txt")), delim = "\t") 
    
    bind_rows(cyto_summary %>% 
                select(node_id = node1) %>% 
                mutate(label = node_id) %>%
                unique(),
              cyto_summary %>% 
                mutate(node_id = as.character(node_id), 
                       label = as.character(n)) %>%
                select(node_id, label) %>% 
                unique()) %>% 
      write_delim("cytoscape nodes label whole group.txt",
      #write_delim(savepath(paste(savename, "cytoscape nodes label whole group.txt")), 
                  delim = "\t")
    
    ## ----
    
    fig
  }


organ_colors <- metadata %>% select(consensus_tissue_name, organ_color) %>% unique()
pal1 <-  organ_colors$organ_color
pal1 <- setNames(pal1, organ_colors$consensus_tissue_name)

specificity_palette <- rev(c("Not detected" = "grey",
                             "Tissue" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

library(influential)

classification_network_plot_2(
  class_table = classification_consensus %>% mutate(spec_category = str_to_sentence(spec_category)),
  gene_col = "gene",
  spec_col = "spec_category",
  enriched_col = "enriched_tissues",
  spec_filter = c("Tissue enriched", "Group enriched"),
  pal = c(pal1, specificity_palette),
  savename = "test_interconsensus",
  enriched_sep = ";",
  node_filter_rank = 2,
  node_filter_show_cat = c("Tissue enriched"),
  node_filter_min = 2,
  node_filter_n_show = 5,
  scale_factor = 0.8
) 
`summarise()` has grouped output by 'enriched'. You can override using the `.groups` argument.Joining, by = "node1"
ggsave("./final_plots/data_presentation/nework_plot2-filter.pdf", height = 20, width = 20)

#Read Comparison data

comp_metadata <- read_csv("./data/final_data/comparison_metadata-init-1-0.csv")
Rows: 124 Columns: 14── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (13): tissue_name, region_tissue_name, consensus_tissue_name, organ_name, tissue_color, region_tissue_color, co...
lgl  (1): regional_tissue
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
human_atlas_tissue <-
  read_delim("./data/human_hpa/rna_tissue_consensus.tsv", delim = "\t") %>%
  mutate(source = "hpa_consensus") %>%
  group_by(Gene) %>%
  mutate(nx = case_when(nTPM == 0 ~ 0,
                        T ~ nTPM / sqrt(sd(nTPM)))) %>%
  ungroup() #%>% 
Rows: 1084208 Columns: 4── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (3): Gene, Gene name, Tissue
dbl (1): nTPM
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  # # just necessary if read data of multiple different sources, then would make sense to average out samples named the same
  # # just from one source atm, so no need for this.
  # group_by(Tissue, Gene) %>%
  # summarise(nTPM = mean(nTPM, na.rm = T)) %>% 
  # ungroup()

hpa_comp <- human_atlas_tissue %>% left_join(
  comp_metadata %>%
    select(hpa_consensus_name, comparison_tissue_name) %>%
    distinct() %>%
    filter(!is.na(hpa_consensus_name)) %>%
    filter(!is.na(comparison_tissue_name)),
  by = c("Tissue" = "hpa_consensus_name")) %>% 
  filter(!is.na(comparison_tissue_name)) %>% 
  group_by(Gene,comparison_tissue_name) %>% 
  summarise(tmm = max(nTPM),
            nx = max(nx)) %>% 
  ungroup() %>% 
  rename(tissue = comparison_tissue_name)
`summarise()` has grouped output by 'Gene'. You can override using the `.groups` argument.
rat_tissue <-
  read_csv("./data/final_data/final_tmm_tissue_name.csv") %>% 
  # #omit gather, data should be  already in long format
  # gather(sample, tmm,-1) %>%
  group_by(target_id) %>%
  mutate(nx = case_when(tmm == 0 ~ 0,
                        T ~ tmm / sqrt(sd(tmm)))) %>%
  ungroup()
Rows: 2224500 Columns: 3── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): target_id, tissue_name
dbl (1): tmm
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  

# select tissues and pool tissues that will get compared
rat_tissue_comp <- rat_tissue %>% left_join(comp_metadata %>% 
                                              select(tissue_name, comparison_tissue_name), by = c("tissue_name" = "tissue_name")) %>%
  filter(!is.na(comparison_tissue_name)) %>% 
  group_by(target_id, comparison_tissue_name) %>% 
  summarise(tmm = max(tmm),
            nx = max(nx)) %>% 
  ungroup() %>% 
  rename(tissue = comparison_tissue_name)
`summarise()` has grouped output by 'target_id'. You can override using the `.groups` argument.
#check if all have the same tissue
all(rat_tissue_comp %>% distinct(tissue) == hpa_comp %>% distinct(tissue) )
[1] TRUE

#Read ortholog data

#Select either "HPA" or "ensemble"
ortholog_data_from <- "HPA"
#ortholog_data_from <- "ensemble"

include_many2many <- TRUE
include_one2many <- TRUE

#Only needed for ensembl:
#define ensemble version
version     <- 103
organism_db <- "rnorvegicus_gene_ensembl"
#mart <- useEnsembl ( biomart="genes" , dataset=organism_db , version=version )

if (ortholog_data_from == "HPA"){
  homolog_data <- read.table("./data/human_hpa/kalle/ensembl_rat_ortholog.tsv", sep = "\t", header = TRUE) %>% 
    filter(ensrnog_id %in% rat_tissue$target_id) %>% 
    filter(ensg_id %in% human_atlas_tissue$Gene)
} else if (ortholog_data_from == "ensemble"){
  homolog_data <- getBM( attributes=c( "ensembl_gene_id", "hsapiens_homolog_ensembl_gene", "hsapiens_homolog_orthology_type") , mart=mart ) %>%
    filter(hsapiens_homolog_ensembl_gene != "") %>% 
    filter(ensembl_gene_id %in% rat_tissue$target_id) %>% 
    filter(hsapiens_homolog_ensembl_gene %in% human_atlas_tissue$Gene) %>% 
    rename(ensrnog_id = ensembl_gene_id, ensg_id = hsapiens_homolog_ensembl_gene, ortholog_type = hsapiens_homolog_orthology_type)
}

if (include_many2many == FALSE){
  homolog_data <- homolog_data %>% filter(!ortholog_type == "ortholog_many2many")
}
if (include_one2many == FALSE){
  homolog_data <- homolog_data %>% filter(!ortholog_type == "ortholog_one2many")
}

#Figure 6

comparison_colors_tbl <- rbind(
  comp_metadata %>% select(name = comparison_tissue_name, color = consensus_tissue_color) %>% distinct(), 
  comp_metadata %>% select(name = tissue_name, color = tissue_color) %>%  distinct()
  ) %>% 
  distinct() %>% 
  drop_na() %>% 
  mutate(name = str_to_sentence(name))

pal <- comparison_colors_tbl$color
pal <- setNames(pal, str_to_sentence(comparison_colors_tbl$name))


plot_data1 <- 
  comp_metadata %>% 
  select(tissue_name, comparison_tissue_name, organ_name) %>% 
  unique() %>% 
  drop_na() %>% 
  mutate(tissue_name = str_to_sentence(tissue_name), comparison_tissue_name = str_to_sentence(comparison_tissue_name)) %>% 
  arrange(organ_name, comparison_tissue_name) %>% 
  mutate(plot_order = row_number()) %>% select(-organ_name)

plot_data2 <- 
  plot_data1 %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("tissue_name", 
                           "comparison_tissue_name"
                           )))
`summarise()` has grouped output by 'label'. You can override using the `.groups` argument.
plot_data3 <- 
  plot_data1 %>% 
  group_by(comparison_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(comparison_colors_tbl,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)
`summarise()` has grouped output by 'column'. You can override using the `.groups` argument.
ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "comparison_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 2, xmax = 2.5, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 0.5, xmax = 1, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "comparison_tissue_name",
      xend = "tissue_name",
      y = left_pos,
      yend = plot_order,
      color = tissue_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "comparison_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  # geom_label(
  #   data = plot_data2 %>%
  #     filter(column == "organ_name"),
  #   #aes(x = column, y = plot_order, label = label),
  #   aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
  #   show.legend = F,
  #   label.size = 0,
  #   hjust = 1,
  #   lineheight = 0.7,
  #   label.padding = unit(0, "mm"),
  #   size = 2 * 5 / 6
  # ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c("Rat tissue", "Comparison tissue"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
Warning: Ignoring unknown parameters: `width`Warning: Ignoring unknown parameters: `label.padding`Warning: Ignoring unknown parameters: `label.padding`
ggsave("final_plots/comparison/comparison_tissues_rat.pdf", height = 5, width = 4)

comparison_colors_tbl <- rbind(
  comp_metadata %>% select(name = comparison_tissue_name, color = consensus_tissue_color) %>% distinct(), 
  comp_metadata %>% select(name = hpa_consensus_name, color = tissue_color) %>%  distinct()
  ) %>% 
  distinct() %>% 
  drop_na() %>% 
  mutate(name = str_to_sentence(name))

pal <- comparison_colors_tbl$color
pal <- setNames(pal, str_to_sentence(comparison_colors_tbl$name))


plot_data1 <- 
  comp_metadata %>% 
  select(hpa_consensus_name, comparison_tissue_name, organ_name) %>% 
  unique() %>% 
  drop_na() %>% 
  mutate(hpa_consensus_name = str_to_sentence(hpa_consensus_name), comparison_tissue_name = str_to_sentence(comparison_tissue_name)) %>% 
  arrange(organ_name, comparison_tissue_name) %>% 
  mutate(plot_order = row_number()) %>% select(-organ_name)

plot_data2 <- 
  plot_data1 %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("comparison_tissue_name", "hpa_consensus_name"
                           )))
`summarise()` has grouped output by 'label'. You can override using the `.groups` argument.
plot_data3 <- 
  plot_data1 %>% 
  group_by(comparison_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(comparison_colors_tbl,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)
`summarise()` has grouped output by 'column'. You can override using the `.groups` argument.
ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "hpa_consensus_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 2, xmax = 2.5, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "comparison_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 0.5, xmax = 1, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "comparison_tissue_name",
      xend = "hpa_consensus_name",
      y = left_pos,
      yend = plot_order,
      color = hpa_consensus_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "hpa_consensus_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "comparison_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  # geom_label(
  #   data = plot_data2 %>%
  #     filter(column == "organ_name"),
  #   #aes(x = column, y = plot_order, label = label),
  #   aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
  #   show.legend = F,
  #   label.size = 0,
  #   hjust = 1,
  #   lineheight = 0.7,
  #   label.padding = unit(0, "mm"),
  #   size = 2 * 5 / 6
  # ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c("Comparison tissue", "Human Tissue"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
Warning: Ignoring unknown parameters: `width`Warning: Ignoring unknown parameters: `label.padding`Warning: Ignoring unknown parameters: `label.padding`
ggsave("final_plots/comparison/comparison_tissues_human_rev.pdf", height = 5, width = 4)

#Batch Correction


#Do batch correction
#join rat and human tmm data together
joined_atlas_comparison_temp <- rat_tissue_comp %>%
  inner_join(homolog_data, by = c("target_id" = "ensrnog_id")) %>%
  unite(mutual_id, target_id, ensg_id, sep = "_") %>%
  mutate(species = "rat") %>%
  select(mutual_id, tissue, species , tmm) %>%
  bind_rows(
    hpa_comp %>%
      inner_join(homolog_data, by = c("Gene" = "ensg_id")) %>%
      unite(mutual_id, ensrnog_id, Gene, sep = "_") %>%
      mutate(species = "human") %>%
      select(mutual_id, tissue, species , tmm)
  )
  #### if tmm is under 0 then it is equal to 0
  #mutate(tmm = ifelse(tmm < 1, 0, tmm ))

#long table with inf of on tissue and species
joined_atlas_comparison_temp_2 <- joined_atlas_comparison_temp %>% 
  unite(id, tissue, species, sep = "_") %>% 
  separate(id, into = c("tissue", "species"), sep = "_", remove = F) 

#wide table only with tmm
joined_atlas_comparison_temp3_tmm <-
  joined_atlas_comparison_temp_2 %>% 
  select(mutual_id, id, tmm) %>% 
  spread(id, tmm) %>% 
  column_to_rownames("mutual_id") 

#log scale
joined_atlas_comparison_temp3_limma <-
  joined_atlas_comparison_temp3_tmm %>%
  {log10(. + 1)} %>%
  limma::removeBatchEffect(batch = colnames(joined_atlas_comparison_temp3_tmm) %>%
                             str_extract("_.*$"))
#put them together in the long format
joined_atlas_comparison<- 
  joined_atlas_comparison_temp_2 %>%
  left_join(joined_atlas_comparison_temp3_tmm %>% 
              as_tibble(rownames = "mutual_id") %>% 
              gather(id, tmm, -1)) %>% 
  left_join(joined_atlas_comparison_temp3_limma %>% 
              as_tibble(rownames = "mutual_id") %>% 
              gather(id, limma_log1p_tmm, -1)) 
Joining, by = c("mutual_id", "id", "tmm")Joining, by = c("mutual_id", "id")

#Figure 7 ##Figure 7A - Cross species dendrogramm


hclust4RNAseq <- function(df, correlation_method = "spearman"){
  #wide dataframe as input 
  #to get correlation between samples, where rows are genes columns are samples
  #to get correlation between genes across samples, input df with genes as columns
  #can use later for dendogram making: ggdendrogram([hclust4RNAseq_results], rotate = FALSE, size = 10, face = "bold")
  similarity <- cor(df, method=correlation_method, use="pairwise.complete.obs")
  dissimilarity <- 1 - similarity
  hcl <- hclust(as.dist(dissimilarity), "average")
  return (hcl)
} 


organ_colors <- comp_metadata %>% select(comparison_tissue_name, consensus_tissue_color) %>% drop_na() %>% distinct()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, str_to_sentence(organ_colors$comparison_tissue_name))

shape_def <- c(21, 22)
shape_def <- setNames(shape_def, c("Human", "Rat"))


plot_comp_dendro_data <- joined_atlas_comparison %>% 
  select(mutual_id, id, limma_log1p_tmm) %>% 
  spread(id, limma_log1p_tmm) %>% 
  column_to_rownames("mutual_id") %>% 
  cor(method = "spearman", use="pairwise.complete.obs") %>%
  {1 - .} %>%
  as.dist() %>% 
  hclust(method = "average") %T>%
  plot %>%
  dendro_data()


dendro_plot_data <- 
  left_join(plot_comp_dendro_data$segments, 
            plot_comp_dendro_data$labels, 
            by = c("x" = "x", "yend" = "y")) %>% 
  separate(label, c("tissue", "species"), sep = "_", remove = FALSE) %>% 
  mutate(tissue = str_to_sentence(tissue), species = str_to_sentence(species)) %>% 
  mutate(species = factor(species, c("Human", "Rat")))

left_plot <- 
  dendro_plot_data %>%
  ggplot() +
  geom_segment(aes(x=y, y=x, xend=yend, yend=xend))+
  # geom_rect(aes(xmin=0, ymin=x + 0.5, 
  #               xmax=-0.02, ymax=xend - 0.5, 
  #               fill = tissue), 
  #           show.legend = F) +
  geom_point(aes(
      x = 0,
      y = x,
      shape = species,
      fill = tissue
    ),
    color = "gray25",
    alpha = 1,
    size = 3,
    stroke = 0.7
  ) +
  geom_text(aes(x=-0.02, y=x,
                label = tissue),
            hjust = 0,
            show.legend = F) +
  scale_shape_manual(values = shape_def ) +
# scale_color_manual(values = pal) +
  scale_fill_manual(values = pal, guide = "none") +
   scale_x_reverse(
     expand = expansion(0.5 ), 
     position = "top")+
   scale_y_continuous(expand = expansion(0.01)) +
  xlab("1 - Spearman's rho") +
  
  theme(axis.text.y = element_blank(), 
        axis.title.y = element_blank(), 
        axis.ticks.y = element_blank(),
        legend.title=element_blank(),
        plot.margin = unit(c(1,1,1,1), units = "mm"), 
        panel.background = element_blank()) 
left_plot
ggsave("./final_plots/comparison/comparison_dendrogram.pdf", width = 6.5, height = 9 )

##Figure 7B - Cros species UMAP

library(plotly)
library(ggplotify)
library(geomtextpath)

comp_metadata <- read_csv("./data/final_data/comparison_metadata-init-1-0.csv")
Rows: 124 Columns: 14── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (13): tissue_name, region_tissue_name, consensus_tissue_name, organ_name, tissue_color, region_tissue_color, co...
lgl  (1): regional_tissue
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
umap_meta <- comp_metadata %>% select(consensus_tissue_color, organ_color, comparison_tissue_name) %>% filter(comparison_tissue_name != "" ) %>% distinct()
wide_data <- joined_atlas_comparison %>% 
  select(-tissue, -species, -tmm) %>% 
  spread(id, limma_log1p_tmm)
seed <- 42
filter_zero_sd = F
n_epochs = 1000 
n_neighbors = 15

pca_res <-
  wide_data %>%
  #no log1p because limma value is already calculated from log scale value
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])

pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

n_neighbors = 15
set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  uwot::umap(n_neighbors = n_neighbors,
             n_epochs = n_epochs) %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>%
  separate(
    sample,
    into = c("tissue", "species"),
    sep = "_",
    remove = F
  ) %>%
  left_join(umap_meta, by = c("tissue" = "comparison_tissue_name"))

organ_colors <-
  umap_meta %>% select(comparison_tissue_name, consensus_tissue_color) %>% unique()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, organ_colors$comparison_tissue_name)

shape_def <- c(21, 22)
shape_def <- setNames(shape_def, c("human", "rat"))


# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
p <- umap_res %>%  
  ggplot(aes(UMAP1, UMAP2)) +
  geom_textpath(
    aes(label = str_to_sentence(tissue)),
    hjust = 0.5,
    vjust = -0.3,
    color = "gray20"
  ) +
  geom_point(
    aes(fill = tissue, shape = species),
    color = "gray25",
    alpha = 1,
    size = 2,
    stroke = 0.7
  ) +
  guides(fill = "none") +
  scale_fill_manual(values = pal) +
  scale_shape_manual(values = shape_def) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          #legend.title =element_blank(), 
                          legend.spacing.y = unit(-0.1, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(shape = guide_legend(ncol = 1, byrow = TRUE, title = "Species"))

p





if (include_many2many == TRUE & include_one2many == TRUE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
} else if (include_many2many == FALSE & include_one2many == TRUE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
} else if (include_many2many == FALSE &
           include_one2many == FALSE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
}


ggsave(comp_umap_file_name, height = 5.5, width = 6.5)

##Figure 7C - Cross species hypergeometric test

#Based on Max Karlsson: https://github.com/maxkarlsson/Pig-Atlas/blob/master/scripts/functions_classification.R

library(viridis)
library(ggsci)



class1 <- hpa_gene_classification(data = rat_tissue_comp %>% select(target_id, tissue, tmm), expression_col = "tmm", tissue_col = "tissue", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1) %>% rename(ensrnog_id = gene)
class2 <- hpa_gene_classification(data = hpa_comp %>% select(Gene, tissue, tmm), expression_col = "tmm", tissue_col = "tissue", gene_col = "Gene", enr_fold = 4, max_group_n = 5, det_lim = 1) %>% 
  rename(ensg_id = gene)

gene_col1 <- "ensrnog_id"
gene_col2 <- "ensg_id"
sep <- ";"

class1_long <-
  class1 %>%
  select(gene1 = gene_col1, enriched_tissues) %>%
  mutate(
    enriched_tissues = ifelse(is.na(enriched_tissues),
                              "not enriched",
                              enriched_tissues),
    enriched1 = T
  ) %>%
  separate_rows(enriched_tissues, sep = sep)

class2_long <-
  class2 %>%
  select(gene2 = gene_col2, enriched_tissues) %>%
  mutate(
    enriched_tissues = ifelse(is.na(enriched_tissues),
                              "not enriched",
                              enriched_tissues),
    enriched2 = T
  ) %>%
  separate_rows(enriched_tissues, sep = sep)

tis_ <-
  unique(c(class1_long$enriched_tissues,
           class2_long$enriched_tissues)) %>%
  sort()

gene_orthologs <- homolog_data %>% select(1,2)

overlap_hyper_all <- expand.grid(id1 = tis_,
            id2 = tis_,
            gene = 1:nrow(gene_orthologs)) %>%
  as_tibble() %>%
  left_join(gene_orthologs %>%
              select(gene1 = gene_col1,
                     gene2 = gene_col2) %>%
              mutate(gene = row_number())) %>%
  select(-gene) %>%
  left_join(class1_long,
            by = c("gene1", "id1" = "enriched_tissues")) %>%
  left_join(class2_long,
            by = c("gene2", "id2" = "enriched_tissues")) %>%
  mutate(
    enriched1 = ifelse(is.na(enriched1),
                       F,
                       enriched1),
    enriched2 = ifelse(is.na(enriched2),
                       F,
                       enriched2)
  ) %>%
  group_by(id1, id2, enriched1, enriched2) %>%
  count() %>%
  group_by(id1, id2) %>%
  summarise(
    # q is the number of successes
    q = sum(n[which(enriched1 & enriched2)]),
    
    # k is the number of tries - i.e. the number of genes that are elevated for either species
    k = sum(n[which(enriched1 | enriched2)]),
    
    # m is the number of possible successes - i.e. the number of genes that are elevated for either
    m = min(sum(n[which(enriched1)]),
            sum(n[which(enriched2)])),
    
    # n is the population size - i.e. the number of genes
    n = sum(n) - m
  ) %>% 
  mutate(p_value = phyper(q - 1, m, n, k, lower.tail = F)) %>%
  mutate(p_value = ifelse(p_value == 0, .Machine$double.xmin, p_value),
         adj_pval = p.adjust(p_value, method = "BH")) %>%
  rename(rat_id = id1, 
         human_id = id2)
Joining, by = "gene"`summarise()` has grouped output by 'id1'. You can override using the `.groups` argument.
overlap_hyper_all %>% 
  write_csv("./data/final_data/rat_human_class_hyper.csv")


plot_order <- 
  overlap_hyper_all %>% 
  ungroup() %>%
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  filter(rat_id == human_id) %>%
  arrange(adj_pval) %>%
  pull(rat_id)


stripped_theme <-
  theme(panel.background = element_rect(fill = NA, colour = NA),
        plot.background = element_rect(fill = NA, color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        legend.key = element_rect(colour = NA),
        #legend.position = "bottom",
        #legend.direction = "horizontal",
        legend.key.size= unit(0.3, "cm"),
        legend.title = element_text(face="italic"),
        axis.line = element_line(colour="black",size=0.5))


overlap_hyper_all %>% 
  group_by(rat_id, human_id) %>%
  mutate(capped_p = min(c(-log10(adj_pval), 20))) %>%
  ungroup() %>% 
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  mutate(rat_id = factor(rat_id, plot_order),
         human_id = factor(human_id, plot_order)) %>%
  ggplot(aes(human_id, rat_id, fill = capped_p)) +
  geom_tile() + 
  geom_tile(data = . %>% 
              filter(adj_pval >= 0.05),
            fill = "black") + 
  scale_fill_viridis(option = "D", direction = 1, 
                     name = "Adjusted p-value") + 
  stripped_theme +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.2),
        legend.position = "top") +
  xlab("Human") + 
  ylab("Rat")


#ggsave("./final_plots/comparison/Overlap hyper heatmap elevated.pdf", width = 6, height = 6)  

overlap_hyper_all %>% 
  filter(human_id != "not enriched" & 
           rat_id != "not enriched") %>%
  ungroup() %>%
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  filter(adj_pval < 0.05) %>%
  mutate(rat_id = factor(rat_id, plot_order),
         human_id = factor(human_id, plot_order),
         adj_pval = case_when(adj_pval < 1e-100 ~ 1e-100,
                              T ~ adj_pval)) %>%
  ggplot(aes(human_id, rat_id, fill = -log10(adj_pval), size = -log10(adj_pval))) +
  geom_point(shape = 21, 
             alpha = 0.8,
             stroke = 0.2,
             color = "black") + 
  # scale_color_viridis(option = "E", direction = 1, 
  #                    name = "Adjusted p-value") + 
  scale_fill_gradientn(colors = ggsci::rgb_material(palette = "deep-orange")) +
  scale_size_continuous(range = c(1, 6)) +
  stripped_theme +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.2),
        legend.position = "top", 
        panel.grid.major = element_line(color = "gray90", size = 0.2)) +
  xlab("Human") + 
  ylab("Rat")

ggsave("./final_plots/comparison/hypergeometric_comparison_tissue.pdf", width = 5, height = 5.5)  
Warning: ‘mode(bg)’ differs between new and previous
     ==> NOT changing ‘bg’

##Figure 7D - Cross species gene annotation alluvial


#classification rat
classification_rat_comp <-
  hpa_gene_classification(
    data = joined_atlas_comparison %>% filter(species == "rat") %>% select(c(-id,-species,-limma_log1p_tmm)),
    expression_col = "tmm",
    tissue_col = "tissue",
    gene_col = "mutual_id",
    enr_fold = 4,
    max_group_n = 5,
    det_lim = 1
  )

rat_comp_tau <- calculate_tau_score(
  joined_atlas_comparison %>% filter(species == "rat") %>% select(-id,-species,-limma_log1p_tmm)  %>%
    spread(tissue, tmm) %>%
    mutate_if(is.numeric, function(x) {
      log10(x + 1)
    }) %>%
    column_to_rownames("mutual_id")
)

classification_rat_comp <- classification_rat_comp %>%
  left_join(rat_comp_tau, by = c("gene" = "gene")) %>%
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score)) %>%
  mutate(species = "Rat")


#classification human

classification_human_comp <-
  hpa_gene_classification(
    data = joined_atlas_comparison %>% filter(species == "human") %>% select(c(-id,-species,-limma_log1p_tmm)),
    expression_col = "tmm",
    tissue_col = "tissue",
    gene_col = "mutual_id",
    enr_fold = 4,
    max_group_n = 5,
    det_lim = 1
  )

human_comp_tau <- calculate_tau_score(
  joined_atlas_comparison %>% filter(species == "human") %>% select(-id,-species,-limma_log1p_tmm)  %>%
    spread(tissue, tmm) %>%
    mutate_if(is.numeric, function(x) {
      log10(x + 1)
    }) %>%
    column_to_rownames("mutual_id")
)

classification_human_comp <- classification_human_comp %>%
  left_join(rat_comp_tau, by = c("gene" = "gene")) %>%
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score)) %>%
  mutate(species = "Human")
alluvial_data_comp <- classification_rat_comp %>%
  select(gene, spec_category) %>%
  rename(Rat = spec_category) %>%
  left_join(
    classification_human_comp %>%
      select(gene, spec_category) %>%
      rename(Human = spec_category), 
    by = "gene"
  )


width = 0.1

alluv_1 <-
  
  alluvial_data_comp %>%
  mutate(Rat = str_to_sentence(Rat),
          Human = str_to_sentence(Human)) %>%
  select(Rat, 
         Human) %>%
  mutate(row_n = row_number()) %>%
  gather(bar, chunk, -row_n) %>%
  mutate(color_vars = 1) %>%
  group_by(row_n) %>%
  mutate(chunk_color = chunk[match(c("Human",
                                     "Rat")[color_vars], bar)]) %>% 
  ungroup() %>%
  
  
  mutate(chunk = factor(chunk, levels = c('Tissue enriched', 'Group enriched', 
                                          'Tissue enhanced', 'Low tissue specificity', 
                                          'Not detected')),
         bar = factor(bar, levels = c("Human",
                                      "Rat"))) %>%
  
  
  ggplot(aes(x = bar, stratum = chunk, alluvium = row_n,
             y = 1)) +
  
  geom_flow(aes(fill = chunk_color), 
            show.legend = F, width = width,
            knot.pos = 1/6) +
  geom_stratum(aes(fill = chunk), 
               show.legend = F, color = NA, width = width) +
  
  scale_x_discrete(expand = c(.1, .1), position = "top") +
  scale_fill_manual(values = c(gene_category_pal)) + 
  
  
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        axis.text.x = element_blank(), 
        axis.ticks = element_blank(), 
        panel.background = element_blank(), 
        axis.title = element_blank()) +
  coord_flip()

# alluv_1

flow_data <-
  ggplot_build(alluv_1)$data[[1]] %>%
  as_tibble() %>%
  {
    if ("side" %in% names(.)) {
      .
    } else{
      mutate(.,
             side = case_when(flow == "from" ~ "start",
                              flow == "to" ~ "end"))
    }
  }



stratum_data <- 
  ggplot_build(alluv_1)$data[[2]]

flow_data_labels <-
  flow_data %>%
  as_tibble() %>%
  select(x, stratum, group, side, ymin, ymax) %>%
  pivot_wider(names_from = side,
              values_from = c(x, stratum, ymin, ymax)) %>%
  mutate_at(
    c(
      "x_end",
      "ymax_end",
      "ymin_end",
      "x_start",
      "ymax_start",
      "ymin_start"
    ),
    as.numeric
  ) %>%
  group_by(stratum_start, stratum_end, x_start, x_end) %>%
  summarise(
    y_end = (min(ymin_end) + max(ymax_end)) / 2,
    y_start = (min(ymin_start) + max(ymax_start)) / 2,
    size = max(ymax_start) - min(ymin_start)
  )
`summarise()` has grouped output by 'stratum_start', 'stratum_end', 'x_start'. You can override using the `.groups` argument.
alluv_2 <- 
  alluv_1 +
  geom_text(data = flow_data_labels,
            aes(x = x_start + width/2,
                y = y_start, 
                label = size), 
            inherit.aes = F, 
            size = 3,
            angle = -90,
            hjust = 1,
            vjust = 0.5) +
  geom_text(data = flow_data_labels,
            aes(x = x_end - width/2,
                y = y_end, 
                label = size), 
            inherit.aes = F, 
            size = 3, 
            angle = -90,
            hjust = 0,
            vjust = 0.5) +
  
  # Stratum label
  
  geom_text(data = stratum_data %>%
              filter(x == 1),
            aes(x = x - width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = 1.5,
            inherit.aes = F) + 
  geom_text(data = stratum_data %>%
              filter(x == 2),
            aes(x = x + width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = -0.5,
            inherit.aes = F) + 
  
  geom_text(data = stratum_data,
            aes(x = x, 
                y = y,
                label = ymax - ymin), 
            size = 4, 
            fontface = "bold",
            color = "white",
            inherit.aes = F)


alluv_2

if (include_many2many == TRUE & include_one2many == TRUE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_all_alluvial.pdf")
} else if (include_many2many == FALSE & include_one2many == TRUE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_only_on2one2many_alluvial.pdf")
} else if (include_many2many == FALSE & include_one2many == FALSE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_only_one2one_alluvial.pdf")}

ggsave(comp_alluv_file_name,width = 8, height = 3)

#Figure S1 - Brain Metadata alluvial plot

#Function adapted from Max Karlsson
multi_alluvial_plot <-
  function(data,
           vars,
           chunk_levels,
           pal,
           color_by = c(1, 3, 3)) {
    selvars = vars
    
    if (!is.null(names(vars))) {
      vars = names(vars)
    }
    
    alluv_1 <-
      data %>%
      ungroup() %>%
      select(selvars) %>%
      ungroup() %>%
      mutate(row_n = row_number()) %>%
      gather(bar, chunk,-row_n) %>%
      left_join(tibble(bar = vars,
                       color_vars = color_by),
                by = "bar") %>%
      group_by(row_n) %>%
      mutate(chunk_color = chunk[match(vars[color_vars], bar)]) %>%
      ungroup() %>%
      
      mutate(chunk = factor(chunk, levels = chunk_levels),
             bar = factor(bar, levels = vars)) %>%
      
      
      ggplot(aes(
        x = bar,
        stratum = chunk,
        alluvium = row_n,
        y = 1
      )) +
      
      geom_flow(aes(fill = chunk_color),
                show.legend = F) +
      geom_stratum(aes(fill = chunk),
                   show.legend = F, color = NA) +
      
      scale_x_discrete(expand = c(.1, .1), position = "top") +
       scale_fill_manual(values = pal) +

      theme(
        axis.text.x = element_text(size = 18, face = "bold"),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank()
      )

    
    
    flow_data <-
      ggplot_build(alluv_1)$data[[1]] %>%
      as_tibble() %>%
      {
        if ("side" %in% names(.)) {
          .
        } else{
          mutate(.,
                 side = case_when(flow == "from" ~ "start",
                                  flow == "to" ~ "end"))
        }
      }
    
    
    stratum_data <-
      ggplot_build(alluv_1)$data[[2]]
    
    flow_data_labels <-
      flow_data %>%
      as_tibble() %>%
      
      select(x, stratum, group, side, ymin, ymax) %>%
      pivot_wider(names_from = side,
                  values_from = c(x, stratum, ymin, ymax)) %>%
      
      mutate_at(
        c(
          "x_end",
          "ymax_end",
          "ymin_end",
          "x_start",
          "ymax_start",
          "ymin_start"
        ),
        as.numeric
      ) %>%
      group_by(stratum_start, stratum_end, x_start, x_end) %>%
      summarise(
        y_end = (min(ymin_end) + max(ymax_end)) / 2,
        y_start = (min(ymin_start) + max(ymax_start)) / 2,
        size = max(ymax_start) - min(ymin_start)
      )
    
    alluv_1 <-
      alluv_1 +
      # geom_text(
      #   data = flow_data_labels,
      #   aes(x = x_start + 1 / 6,
      #       y = y_start,
      #       label = size),
      #   inherit.aes = F,
      #   size = 3,
      #   hjust = 0
      # ) +
      # geom_text(
      #   data = flow_data_labels,
      #   aes(x = x_end - 1 / 6,
      #       y = y_end,
      #       label = size),
      #   inherit.aes = F,
      #   size = 3,
      #   hjust = 1
      # ) +
      # 
      # Stratum label
      geom_text(
        data = stratum_data,
        aes(
          x = x,
          y = y,
          label = paste(stratum#, 
                       # paste("[", ymax - ymin, "]", sep = "")
                        )
        ),
        size = 4,
        inherit.aes = F
      )
                
    
    alluv_1
  }
metadata_b <- metadata %>% filter(organ_name =="Brain")

t_names <- metadata_b$tissue_name %>% unique()
r_names <- metadata_b$region_tissue_name %>% unique()
c_names <- metadata_b$consensus_tissue_name %>% unique()
o_names <- metadata_b$organ_name %>% unique()

t_colors <- metadata_b %>% select(tissue_name, tissue_color) %>% unique() %>% rename (chunk = tissue_name, color = tissue_color)
r_colors <- metadata_b %>% select(region_tissue_name, region_tissue_color) %>% unique() %>% rename (chunk = region_tissue_name, color = region_tissue_color)
c_colors <- metadata_b %>% select(consensus_tissue_name, consensus_tissue_color) %>% unique() %>% rename (chunk = consensus_tissue_name, color = consensus_tissue_color)
o_colors <- metadata_b %>% select(organ_name, organ_color) %>% unique() %>% rename (chunk = organ_name, color = organ_color)

bind_colors <- bind_rows(t_colors, r_colors, o_colors) %>% unique() %>% arrange(chunk) %>% mutate(row_n = row_number())
pal <-  bind_colors$color
pal <- setNames(pal, bind_colors$chunk)

data = metadata_b
vars = c("tissue_name", "region_tissue_name", "organ_name")
chunk_levels = c(t_names, r_names, o_names) %>% unique()
color_by = c(1, 3, 3)

multi_alluvial_plot(data = metadata_b, vars = vars, chunk_levels = chunk_levels, pal = pal, color_by = c(1, 3, 3))
`summarise()` has grouped output by 'stratum_start', 'stratum_end', 'x_start'. You can override using the `.groups` argument.
ggsave("final_plots/alluvial/brain-tco-1_p.pdf", height = 7, width = 10)

#Figure S2 - Normalisation comparison TPM vs nTPM (TMM)

tpm_sample <-read_csv("./data/final_data/curated_pTPM_rattus_norvegicus_v103.csv")
Rows: 22245 Columns: 353── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): target_id
dbl (352): ventral.forebrain_male.3, ventral.forebrain_male.2, ventral.forebrain_female.3, ventral.forebrain_female...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_subset_ID <- metadata[match(unique(metadata$consensus_tissue_name), metadata$consensus_tissue_name),] %>% select(ID)
sample_subset_tmm <- tmm_sample %>% select(target_id, sample_subset_ID$ID) %>% gather(sample, tmm, -1)
sample_subset_tpm <- tpm_sample %>% select(target_id, sample_subset_ID$ID) %>% gather(sample, tpm, -1)

# plot_data <- left_join(sample_subset_tmm, sample_subset_tpm, by = c("target_id", "sample")) %>%
#   mutate(log1p_tmm = log10(tmm + 1), log1p_tpm = log10(tpm +1)) %>% select(-tmm, -tpm) %>% 
#   gather(expression_type, expression, -1, -2) %>% 
#   left_join(metadata %>% select(ID, tissue_name, consensus_tissue_name), by = c("sample" = "ID"))

plot_data <- left_join(sample_subset_tmm, sample_subset_tpm, by = c("target_id", "sample")) %>%
  gather(expression_type, expression, -1, -2) %>% mutate_if(is.numeric, function(x) {
                log10(x + 1)
              }) %>% 
  left_join(metadata %>% select(ID, tissue_name, consensus_tissue_name), by = c("sample" = "ID"))


pal_tbl <- metadata %>% select(consensus_tissue_name, consensus_tissue_color) %>% distinct()
pal <- pal_tbl %>% pull(consensus_tissue_color)
pal <- setNames(pal, pal_tbl %>% pull(consensus_tissue_name))


ggplot(data = plot_data, aes(x = expression, y =sample, fill = consensus_tissue_name)) +
  geom_boxplot(draw_quantiles = 0.5, outlier.size = 0.5, outlier.alpha = 0.3)+
  facet_wrap(~expression_type)+
  scale_fill_manual(values = pal) +
  theme(legend.position = "none", 
        axis.title = element_blank())
Warning: Ignoring unknown parameters: `draw_quantiles`
ggsave("./final_plots/tpm_tmm_comp_boxplot.pdf", width=7, height = 12)

#Figure S3 - Spearman heatmap (tissye type level)


##Spearman's roh heatmap at tissue type level
if(file.exists("./data/final_data/spearman_corr_tissues.csv")) {
  tissue_tmm_spearman <- read_csv("./data/final_data/spearman_corr_tissues.csv")
} else {
  tissue_tmm_spearman <-  tmm_tissue %>%
    spread(tissue_name, tmm) %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "tissue_name")
  write_csv(as.data.frame(tissue_tmm_spearman) %>% as_tibble(rownames = "tissue_name"),"./data/final_data/spearman_corr_tissues.csv")
}
Rows: 100 Columns: 101── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): tissue_name
dbl (100): abdominal adipose tissue, adrenal gland, amygdala, aorta, artery, bone marrow, breast, bronchus, brown a...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tissue_tmm_spearman %>% 
  column_to_rownames("tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()


ggsave("./final_plots/data_presentation/spearman_corr_tissue.pdf", height = 20, width = 20)

NA
NA

#Figure S4 - Comparison Pheatmap

if(file.exists("./data/final_data/spearman_corr_tissues.csv")) {
  tissue_tmm_spearman <- read_csv("./data/final_data/spearman_corr_tissues.csv")
} else {
  tissue_tmm_spearman <-  tmm_tissue %>%
    spread(tissue_name, tmm) %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "tissue_name")
  write_csv(as.data.frame(tissue_tmm_spearman) %>% as_tibble(rownames = "tissue_name"),"./data/final_data/spearman_corr_tissues.csv")
}
Rows: 100 Columns: 101── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (1): tissue_name
dbl (100): abdominal adipose tissue, adrenal gland, amygdala, aorta, artery, bone marrow, breast, bronchus, brown a...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joined_atlas_comparison_pheat_data <-  joined_atlas_comparison %>%
  select(mutual_id, id,limma_log1p_tmm ) %>%  
  spread(id, limma_log1p_tmm) %>% 
  column_to_rownames("mutual_id") %>% 
  cor(method = "spearman", use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  as_tibble(rownames = "tissue_name")

joined_atlas_comparison_pheat_data %>% 
  column_to_rownames("tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()


ggsave("./final_plots/comparison/pheatmap.pdf", width = 20, height = 20)

sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Ventura 13.1

Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggsci_2.9           geomtextpath_0.1.1  plotly_4.10.1       influential_2.2.6   magrittr_2.0.3     
 [6] viridis_0.6.2       viridisLite_0.4.1   igraph_1.3.5        patchwork_1.1.2     ggraph_2.1.0       
[11] ggrepel_0.9.2       ggplotify_0.1.0     pheatmap_1.0.12     uwot_0.1.14.9000    Matrix_1.5-3       
[16] ggdendro_0.1.23     ggalluvial_0.12.3   forcats_0.5.2       stringr_1.5.0       dplyr_1.0.10       
[21] purrr_0.3.5         readr_2.1.3         tidyr_1.2.1         tibble_3.1.8        ggplot2_3.4.0      
[26] tidyverse_1.3.2     biomaRt_2.52.0      pcaMethods_1.88.0   Biobase_2.56.0      BiocGenerics_0.42.0

loaded via a namespace (and not attached):
  [1] readxl_1.4.1           backports_1.4.1        BiocFileCache_2.4.0    systemfonts_1.0.4     
  [5] plyr_1.8.8             lazyeval_0.2.2         sp_1.5-1               splines_4.2.1         
  [9] NOISeq_2.40.0          GenomeInfoDb_1.32.4    digest_0.6.31          yulab.utils_0.0.5     
 [13] htmltools_0.5.4        fansi_1.0.3            memoise_2.0.1          googlesheets4_1.0.1   
 [17] cluster_2.1.4          tzdb_0.3.0             limma_3.52.4           Biostrings_2.64.1     
 [21] graphlayouts_0.8.4     modelr_0.1.10          vroom_1.6.0            timechange_0.1.1      
 [25] prettyunits_1.1.1      colorspace_2.0-3       blob_1.2.3             rvest_1.0.3           
 [29] rappdirs_0.3.3         textshaping_0.3.6      haven_2.5.1            xfun_0.35             
 [33] crayon_1.5.2           RCurl_1.98-1.9         jsonlite_1.8.4         glue_1.6.2            
 [37] polyclip_1.10-4        gtable_0.3.1           gargle_1.2.1           zlibbioc_1.42.0       
 [41] XVector_0.36.0         kernlab_0.9-31         prabclus_2.3-2         DEoptimR_1.0-11       
 [45] scales_1.2.1           DBI_1.1.3              Rcpp_1.0.9             progress_1.2.2        
 [49] gridGraphics_0.5-1     bit_4.0.5              mclust_6.0.0           stats4_4.2.1          
 [53] htmlwidgets_1.5.4      httr_1.4.4             FNN_1.1.3.1            RColorBrewer_1.1-3    
 [57] fpc_2.2-9              modeltools_0.2-23      ellipsis_0.3.2         pkgconfig_2.0.3       
 [61] XML_3.99-0.13          flexmix_2.3-18         farver_2.1.1           sass_0.4.4            
 [65] nnet_7.3-18            dbplyr_2.2.1           utf8_1.2.2             labeling_0.4.2        
 [69] tidyselect_1.2.0       rlang_1.0.6            AnnotationDbi_1.58.0   munsell_0.5.0         
 [73] cellranger_1.1.0       tools_4.2.1            cachem_1.0.6           cli_3.4.1             
 [77] generics_0.1.3         RSQLite_2.2.19         broom_1.0.1            evaluate_0.19         
 [81] fastmap_1.1.0          ragg_1.2.4             yaml_2.3.6             knitr_1.41            
 [85] bit64_4.0.5            fs_1.5.2               tidygraph_1.2.2        robustbase_0.95-0     
 [89] KEGGREST_1.36.3        xml2_1.3.3             compiler_4.2.1         rstudioapi_0.14       
 [93] filelock_1.0.2         curl_4.3.3             png_0.1-8              reprex_2.0.2          
 [97] tweenr_2.0.2           bslib_0.4.1            stringi_1.7.8          lattice_0.20-45       
[101] vctrs_0.5.1            pillar_1.8.1           lifecycle_1.0.3        jquerylib_0.1.4       
[105] data.table_1.14.6      irlba_2.3.5.1          bitops_1.0-7           R6_2.5.1              
[109] gridExtra_2.3          IRanges_2.30.1         MASS_7.3-58.1          assertthat_0.2.1      
[113] withr_2.5.0            S4Vectors_0.34.0       GenomeInfoDbData_1.2.8 diptest_0.76-0        
[117] parallel_4.2.1         hms_1.1.2              grid_4.2.1             class_7.3-20          
[121] rmarkdown_2.18         googledrive_2.0.0      ggforce_0.4.1          lubridate_1.9.0       
---
title: "R Notebook"
output:
  html_document:
    df_print: paged
  html_notebook: default
  pdf_document: default
---
```{r}
library(tidyverse)
library(ggplot2)
library(ggalluvial)
library(biomaRt)
library(ggdendro)
library(pcaMethods)
library(uwot)
library(pheatmap)
library(ggplotify)
library (ggrepel)
library(ggraph)
library(patchwork)



select <- dplyr::select
```

```{r}
tmm_sample <- read_csv("./data/final_data/curated_pTMM_rattus_norvegicus_v103.csv")
metadata <- read_csv("./data/final_data/curated_metadata.csv")
all(sort(colnames(tmm_sample[-1])) == sort(metadata$ID))

```

#Generate hierarchical data
```{r}


#slow, but understanable
tmm_tissue <- tmm_sample %>% 
  gather(sample, tmm, -1) %>% 
  left_join(metadata %>% select(ID,tissue_name), by = c("sample" = "ID")) %>% 
  group_by(target_id, tissue_name) %>% 
  summarize(tmm = mean(tmm)) %>% 
  ungroup()

write_csv(tmm_tissue, file="./data/final_data/final_tmm_tissue_name.csv")

tmm_region_tissue <- tmm_tissue %>% 
  left_join(metadata %>% select(tissue_name,region_tissue_name), by = c("tissue_name" = "tissue_name")) %>% 
  group_by(target_id, region_tissue_name) %>% 
  summarize(tmm = max(tmm)) %>% 
  ungroup()

write_csv(tmm_region_tissue, file="./data/final_data/final_tmm_region_tissue_name.csv")

tmm_consensus_tissue <- tmm_region_tissue %>% 
  left_join(metadata %>% select(region_tissue_name,consensus_tissue_name), by = c("region_tissue_name" = "region_tissue_name")) %>% 
  group_by(target_id, consensus_tissue_name) %>% 
  summarize(tmm = max(tmm)) %>% 
  ungroup()

write_csv(tmm_consensus_tissue, file="./data/final_data/final_tmm_consensus_name.csv")

#check
all(metadata$tissue_name %>% unique() %>% sort() == tmm_tissue$tissue_name %>% unique() %>% sort())
all(metadata$region_tissue_name %>% unique() %>% sort() == tmm_region_tissue$region_tissue_name %>% unique() %>% sort())
all(metadata$consensus_tissue_name %>% unique() %>% sort() == tmm_consensus_tissue$consensus_tissue_name %>% unique() %>% sort())
```

```{r}
# # faster, but less understandable
# unique_tissue <- select(metadata,tissue_name) %>%
#   distinct()
# tmm_by_tissue <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_tissue$tissue_name) {
#   curent_tissue_meta = metadata[metadata$tissue_name == i,]
#   current_tmm = select(tmm_sample,c(target_id, curent_tissue_meta$ID))
#   means <- current_tmm %>% select(curent_tissue_meta$ID) %>% rowMeans()
#   tmm_by_tissue[toString(i)] <- means
# }
# #tmm_by_tissue <- tmm_by_tissue %>% gather(tissue_name, tmm, -1)
# #write.csv(tmm_by_tissue, file="tmm_tissue_name.csv", row.names = FALSE)
# 
# unique_region <- select(metadata,region_tissue_name) %>%
#   distinct()
# tmm_by_region <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_region$region_tissue_name) {
#   current_region_meta = metadata[metadata$region_tissue_name == i,]
#   current_tmm = select(tmm_by_tissue, c(target_id, unique(current_region_meta$tissue_name)))
#   max <- select(current_tmm,-target_id) %>% apply(1,max)
#   tmm_by_region[toString(i)] <- max
# }
# 
# # write.csv(tmm_by_region, file="tmm_region_name.csv", row.names = FALSE)
# 
# unique_consensus <- select(metadata,consensus_tissue_name) %>%
#   distinct()
# tmm_by_consensus <- tibble(target_id = tmm_sample$target_id)
# for (i in unique_consensus$consensus_tissue_name) {
#   current_consensus_meta = metadata[metadata$consensus_tissue_name == i,]
#   current_tmm = select(tmm_by_region, c(target_id, unique(current_consensus_meta$region_tissue_name)))
#   max <- select(current_tmm,-target_id) %>% apply(1,max)
#   tm_by_consensus[toString(i)] <- max
# }

#write.csv(tmm_by_consensus, file="tmm_consensus_name.csv", row.names = FALSE)
```

#Figure 1
##Figure 1A - Hierarchy Overview
```{r}
tissue_colors_palette_full <- rbind(
  metadata %>% select(name = tissue_name, color = tissue_color), 
  metadata %>% select(name = consensus_tissue_name, color = consensus_tissue_color),
  metadata %>% select(name = organ_name, color = organ_color)
) %>% distinct() %>% 
  mutate(name = str_to_sentence(name)) %>% arrange(name)

pal <- tissue_colors_palette_full$color
pal <- set_names(pal,tissue_colors_palette_full$name )


plot_data1 <- 
  metadata %>%
  select(tissue_name, region_tissue_name, consensus_tissue_name, organ_name) %>% #,
         #tissue_color, region_tissue_color, consensus_tissue_color, organ_color) %>% 
  mutate(tissue_name = str_to_sentence(tissue_name), region_tissue_name = str_to_sentence(region_tissue_name), consensus_tissue_name = str_to_sentence(consensus_tissue_name), organ_name = str_to_sentence( organ_name)) %>%   unique() %>% 
  mutate(organ_name = factor(case_when(organ_name == "Male reproductive system" ~ 
                                         "Male tissues",
                                       organ_name == "Breast and female reproductive system" ~
                                         "Female tissues",
                                       organ_name == "Adipose & soft tissue" ~ 
                                         "Connective & soft tissue",
                                       organ_name == "Bone marrow & immune system" ~
                                         "Bone marrow & lymphoid tissues",
                                       T ~ organ_name),
                             c("Brain",
                               "Eye",
                               "Endocrine tissues",
                               "Respiratory system",
                               "Proximal digestive tract",
                               "Gastrointestinal tract",
                               "Liver & gallbladder",
                               "Kidney & urinary bladder",
                               "Pancreas",
                               "Male tissues",
                               "Female tissues",
                               "Muscle tissues",
                               "Connective & soft tissue",
                               "Skin",
                               "Bone marrow & lymphoid tissues")))

plot_data1 <- plot_data1 %>% 
  arrange(organ_name,
          consensus_tissue_name,
          region_tissue_name,
          tissue_name) %>% 
  mutate(plot_order = row_number())

plot_data2 <- 
  plot_data1 %>%
  select(-region_tissue_name) %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("organ_name", 
                           "consensus_tissue_name", 
                           "tissue_name")))

plot_data3 <- 
  plot_data1 %>% 
  group_by(consensus_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(tissue_colors_palette_full,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)


ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 3, xmax = 4, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "consensus_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 1, xmax = 2, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "consensus_tissue_name",
      xend = "tissue_name",
      y = left_pos,
      yend = plot_order,
      color = tissue_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "consensus_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  geom_label(
    data = plot_data2 %>%
      filter(column == "organ_name"),
    #aes(x = column, y = plot_order, label = label),
    aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
    show.legend = F,
    label.size = 0,
    hjust = 1,
    lineheight = 0.7,
    label.padding = unit(0, "mm"),
    size = 2 * 5 / 6
  ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c('Organ system', "Grouped tissue", "Tissue type"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
ggsave("final_plots/tissues.pdf", height = 7, width = 4)

```

##Figure 1B - Ward's Retina-Dendrogramm
```{r}

##Functino based on Max Karlsson's retinogramm
circular_dendrogram_retinastyle_2 <-
  function(clust, color_mapping, label_col, color_col, 
           scale_expansion = c(0.25, 0.25), text_size = 3, width_range = c(1.5, 6), 
           arc_strength = 0.8, default_color = "gray80") {
    require(ggraph)
    require(igraph)
    require(viridis)
    require(tidyverse)
    require(magrittr)
    
    dendrogram <-
      clust %>%
      as.dendrogram()
    
    
    
    g <-
      ggraph(dendrogram, layout = 'dendrogram', circular = T)
    # 
    # g +
    #   geom_edge_fan(data = edge_data %>%
    #                    mutate(hghl = edge.id == 99),
    #                  aes(label = edge_id, color = as.factor(rank_radius)),
    #                  width =4) +
    #   geom_node_text(aes(label = label))
    
    edge_data <- 
      get_edges()(g$data) %>%
      as_tibble() %>%
      left_join(color_mapping %>%
                  select(label = label_col,
                         color = color_col),
                by = c("node2.label" = "label")) %>%
      mutate(radius = xend^2 + yend^2,
             edge.id = as.character(edge.id)) %>%
      arrange(-radius) %>%
      mutate(edge_id = as.character(row_number()),
             rank_radius = unclass(factor(-radius)),
             x_m = round(x, 10),
             y_m = round(y, 10),
             xend_m = round(xend, 10),
             yend_m = round(yend, 10)) 
    
    
    edge_id_colors <- 
      edge_data %>% 
      filter(!is.na(color)) %$%
      set_names(color, edge_id)
    
    
    for(rank_rad in 2:max(edge_data$rank_radius)) {
      edge_id_colors_new <- 
        left_join(edge_data %>%
                    select(edge_id, radius, xend_m, yend_m, rank_radius) %>%
                    filter(rank_radius == rank_rad),
                  edge_data %>%
                    select(edge_id, radius, x_m, y_m, rank_radius) %>%
                    filter(rank_radius < rank_rad),
                  by = c("xend_m" = "x_m", "yend_m" = "y_m")) %>%
        left_join(enframe(edge_id_colors),
                  by = c("edge_id.y" = "name")) %>%
        group_by(edge_id.x) %>% 
        summarise(color = ifelse(n_distinct(value) == 1 & any(value != default_color), 
                                 as.character(unique(value)),
                                 default_color)) %$%
        set_names(color, edge_id.x)
      edge_id_colors <- 
        c(edge_id_colors, edge_id_colors_new)
    }
    
    
    
    g +
      scale_edge_width(range = width_range)+
      geom_edge_diagonal(data = edge_data,
                         aes(edge_color = as.character(edge_id),
                             edge_width = 1 - sqrt(xend^2 + yend^2)),
                         strength = arc_strength,
                         show.legend = F) +
      
      
      scale_edge_color_manual(values = edge_id_colors)  +
      g$data %>%
      filter(label != "") %>%
      mutate(degree = case_when(x >= 0 ~ asin(y) * 180 / pi,
                                x < 0 ~ 360 - asin(y) * 180 / pi)) %>%
      left_join(color_mapping %>%
                  select(label = label_col,
                         color = color_col),
                by = "label") %>%
                {geom_node_text(data = .,
                                aes(label = label),
                                angle = .$degree,
                                hjust = ifelse(.$x < 0, 
                                               1, 
                                               0),
                                vjust = 0.5,
                                size = text_size)}  +
      scale_x_continuous(expand = expand_scale(scale_expansion)) +
      scale_y_continuous(expand = expand_scale(scale_expansion)) +
      
      coord_fixed() +
      theme_void()
  }
```


```{r}

hclust4RNAseq_ward <- function(df, correlation_method = "spearman"){
  #wide dataframe as input 
  #to get correlation between samples, where rows are genes columns are samples
  #to get correlation between genes across samples, input df with genes as columns
  #can use later for dendogram making: ggdendrogram([hclust4RNAseq_results], rotate = FALSE, size = 10, face = "bold")
  similarity <- cor(df, method=correlation_method, use="pairwise.complete.obs")
  dissimilarity <- 1 - similarity
  hcl <- hclust(as.dist(dissimilarity), "ward.D2")
  return (hcl)
} 

tissue_dendro_ward <- hclust4RNAseq_ward(tmm_tissue %>% mutate(tissue_name = str_to_sentence(tissue_name)) %>%  spread(tissue_name, tmm) %>% column_to_rownames("target_id"))

circular_dendrogram_retinastyle_2(
  clust = tissue_dendro_ward, 
  color_mapping = metadata %>% 
    select(tissue_name, tissue_color) %>% 
    mutate(tissue_name = str_to_sentence(tissue_name)), 
  label_col = "tissue_name", 
  color_col = "tissue_color", 
  scale_expansion = c(0.7, 0.7), 
  text_size = 2.4, 
  width_range = c(0.5, 4),
  arc_strength = 0.4, 
  default_color = "gray80")

ggsave("./final_plots/data_presentation/retinagram_all_tissue_clust_ward.pdf", width = 6, height = 6)

```

##Figure 1C - Spearman heatmap (grouped tissue)
```{r}
##Spearman's roh heatmap at grouped tissue level

if(file.exists("./data/final_data/spearman_corr_consensus_tissues.csv")) {
  consensus_tmm_spearman <- read_csv("./data/final_data/spearman_corr_consensus_tissues.csv")
} else {
  consensus_tmm_spearman <-  tmm_consensus_tissue %>% 
    spread(consensus_tissue_name, tmm) %>% 
    column_to_rownames("target_id") %>% 
    cor(method="spearman", use="pairwise.complete.obs") %>% 
    as.data.frame() %>% 
    as_tibble(rownames = "consensus_tissue_name")
  write_csv(consensus_tmm_spearman,"./data/final_data/spearman_corr_consensus_tissues.csv")
}

consensus_tmm_spearman %>% 
  rename_with(str_to_sentence, -1) %>% 
  mutate(consensus_tissue_name = str_to_sentence(consensus_tissue_name)) %>% 
  column_to_rownames("consensus_tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()
ggsave("./final_plots/data_presentation/spearman_corr_consensus_tissue.pdf", height = 10, width = 10)

```


#Figure 2
##Figure 2A - Sample level PCA Plot
```{r}

sample_pca <-
  tmm_sample %>% 
  
  # gather(sample_name, tmm, -1) %>%
  # group_by(target_id) %>%
  # mutate(sd = sd(tmm)) %>%
  # ungroup() %>%
  # filter(sd > 0) %>%
  # select(-sd) %>%
  # spread(sample_name, tmm) %>%
  
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#sample_pca@scores

summary(sample_pca)

plot_data <- sample_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


organ_colors <- metadata %>% select(organ_name, organ_color) %>% unique()
pal <-  organ_colors$organ_color
pal <- setNames(pal, organ_colors$organ_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = organ_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal) + 
  #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
  xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 3, byrow = TRUE)) 


#ggsave("./final_plots/data_presentation/sample_pca_1.pdf", width = 8, height = 8)
# ggsave("./final_plots/data_presentation/sample_pca_1_w_filter.pdf", width = 8, height = 8)
# 
# plot_data %>%
#   ggplot(aes(PC1, PC2)) +
#   geom_point(aes(fill = organ_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
#   scale_fill_manual(values = pal) + 
#   #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
#   xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
#   ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
#   theme_classic() + theme(legend.text = element_text(size = 10),
#                           legend.title =element_blank(), 
#                           legend.position = "bottom",
#                           legend.spacing.y = unit(-0.3, 'cm'),
#                           legend.spacing.x = unit(-0.01, 'cm')) +
#   guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 
# 
# #ggsave("./final_plots/data_presentation/sample_pca_2.pdf", width = 5, height = 5)
# # ggsave("./final_plots/data_presentation/sample_pca_2_w_filter.pdf", width = 7, height = 7)


```

###Extra PCA plots not in report
```{r}
sample_pca <-
  tmm_sample %>% 
  # gather(sample_name, tmm, -1) %>% 
  # group_by(target_id) %>%
  # mutate(sd = sd(tmm)) %>%
  # ungroup() %>% 
  # filter(sd > 0) %>% 
  # select(-sd) %>% 
  # spread(sample_name, tmm) %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#sample_pca@scores

summary(sample_pca)

plot_data <- sample_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color, organ_name) %>% unique() %>% filter(organ_name == "Brain") %>% select(-organ_name)
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = region_tissue_name),  color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal, na.value = "#FFFFFF") + 
  xlim(-75,0) +
  ylim(-10,5) +
  xlab(paste("PC1", sample_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", sample_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/brain_w_all_sample_pca.pdf", width = 5, height = 5)


```

```{r}

brain_pca <-
  tmm_sample %>% select(c(target_id, metadata %>% filter(organ_name =="Brain") %>% .$ID))%>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>%
  column_to_rownames("target_id") %>%
  scale() %>%
  t() %>%
  pca(nPcs = 8)

#brain_pca@scores


plot_data <- brain_pca %>% 
  scores() %>% 
  as_tibble(rownames = "sample_id") %>% 
  left_join(metadata,
            by = c("sample_id" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color) %>% unique()
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)

plot_data %>%
  ggplot(aes(PC1, PC2)) +
  geom_point(aes(fill = region_tissue_name), color = "gray25", alpha = 0.7, shape=21, size = 2, stroke = 0.5) +
  scale_fill_manual(values = pal) + 
  #geom_text(aes(label = sample_id), vjust = 1,hjust = 0, nudge_y =-1) +
  xlab(paste("PC1", brain_pca@R2[1] * 100, "% of the variance")) +
  ylab(paste("PC2", brain_pca@R2[2] * 100, "% of the variance")) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/brain_sample_pca.pdf", width = 5, height = 5)



```

##Figure 2B - UMAP Plot
```{r}
wide_data = tmm_sample 
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))

organ_colors <- metadata %>% select(organ_name, organ_color) %>% unique()
pal <-  organ_colors$organ_color
pal <- setNames(pal, organ_colors$organ_name)

# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = organ_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + scale_fill_manual(values = pal) + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_umap_euclidean.pdf", width = 5, height = 5.5)
```

##Figure 2C - Brain Umap Plot
```{r}
#Plot focusing only on brain samples, but UMAP was based on the whole sample set, not only brian samples.

wide_data = tmm_sample 
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))


region_colors <- metadata %>% select(region_tissue_name, region_tissue_color, organ_name) %>% unique() %>% filter(organ_name == "Brain") %>% select(-organ_name)
pal <-  region_colors$region_tissue_color
pal <- setNames(pal, region_colors$region_tissue_name)
# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = region_tissue_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + 
  scale_fill_manual(values = pal, na.value = "#FFFFFF") + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  xlim(-11,-6) +
  ylim(-8,-2) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                          legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 2, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_focus_brain_umap_euclidean.pdf", width = 5, height = 5.5)

```
###Extra UMAP plot not in report
```{r}
#UMAP plot based only on brain samples, thus different than the plot above.

wide_data = tmm_sample %>% select(c(target_id, metadata %>% filter(organ_name == "Brain") %>% .$ID))
seed = 4
n_epochs = 1000
n_neighbors = 15
set.seed(seed)
    
pca_res <-
  wide_data %>% 
  mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])
    
    
pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  umap(n_neighbors = n_neighbors,
       n_epochs = n_epochs) %>%
       #metric = "correlation") %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>% 
  left_join(metadata, by = c("sample" = "ID"))

region_tissue_colors <- metadata %>% select(region_tissue_name, region_tissue_color) %>% unique()
pal <-  region_tissue_colors$region_tissue_color
pal <- setNames(pal, region_tissue_colors$region_tissue_name)

# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
umap_res %>%  ggplot(aes(UMAP1, UMAP2)) +
  geom_point(
    aes(fill = region_tissue_name),
    color = "gray25",
    alpha = 0.7,
    shape = 21,
    size = 2,
    stroke = 0.5
  ) + scale_fill_manual(values = pal) + 
  # theme_bw() + theme(
  #   panel.border = element_blank(),
  #   panel.grid.major = element_blank(),
  #   panel.grid.minor = element_blank(),
  #   axis.line = element_line(colour = "black"),
  #   legend.title = element_blank()
  # )
  theme_classic() + theme(legend.text = element_text(size = 10),
                          legend.title =element_blank(), 
                         # legend.position = "bottom",
                          legend.spacing.y = unit(-0.3, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(fill = guide_legend(ncol = 1, byrow = TRUE)) 

ggsave("./final_plots/data_presentation/sample_brain_umap_euclidean_l.pdf", width = 5.5, height = 3)
```
##Figure 2D -  grouped sample to sample correlation plots
```{r}
if(file.exists("./data/final_data/spearman_sample.csv")) {
  sample_tmm_spearman <- read_csv("./data/final_data/spearman_sample.csv")
} else {
  sample_tmm_spearman <-  tmm_sample %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "sample_name")
  write_csv(as.data.frame(sample_tmm_spearman) %>% as_tibble(),"./data/final_data/spearman_sample.csv")
}

correlation_to_different_organs <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_organ <- metadata %>% filter(ID == sample) %>% .$organ_name %>% unique()
  different_organ_samples <- metadata %>% filter(organ_name != sample_organ) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% different_organ_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_different_organs <- rbind(correlation_to_different_organs, sample_correlation) 
}

correlation_to_same_organ <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_organ <- metadata %>% filter(ID == sample) %>% .$organ_name %>% unique()
  same_organ_samples <- metadata %>% filter(organ_name == sample_organ) %>% filter(ID != sample) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_organ_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_same_organ <- rbind(correlation_to_same_organ, sample_correlation) 
}

correlation_to_same_tissue <- tibble(sample_name = c(), correlation = c())
for (sample in metadata$ID){
  sample_tissue <- metadata %>% filter(ID == sample) %>% .$tissue_name %>% unique()
  same_tissue_samples <- metadata %>% filter(tissue_name == sample_tissue) %>% filter(ID != sample) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_tissue_samples) %>% select("sample_name", sample) %>% rename("correlation" = sample)
  correlation_to_same_tissue <- rbind(correlation_to_same_tissue, sample_correlation) 
}

correlation_to_same_tissue_by_tissue <- tibble(sample_name = c(), correlation = c(), tissue_name = c())
for (sample in metadata$ID){
  sample_tissue <- metadata %>% filter(ID == sample) %>% .$tissue_name %>% unique()
  same_tissue_samples <- metadata %>% filter(tissue_name == sample_tissue) %>% .$ID
  sample_correlation <- sample_tmm_spearman %>% filter(sample_name %in% same_tissue_samples) %>% select("sample_name", sample) %>% filter(sample_name != sample) %>%  left_join(metadata %>% select(ID, tissue_name), by= c("sample_name" = "ID")) %>% rename("correlation" = sample)
  correlation_to_same_tissue_by_tissue <- rbind(correlation_to_same_tissue_by_tissue, sample_correlation) 
}

correlation_to_same_tissue_by_tissue <- correlation_to_same_tissue_by_tissue %>% 
  mutate(tissue_name = str_to_sentence(tissue_name)) %>% 
  group_by(tissue_name) %>% 
  mutate(min = min(correlation)) %>% 
  ungroup() %>% 
  arrange(min)

p1 <- correlation_to_different_organs %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100)  + xlim(0.5,1)+
  theme_classic() + 
  theme(panel.background = element_rect("gray90"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title.y.left = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) + 
  ggtitle("Different organs")

p2 <- correlation_to_same_organ %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100) + xlim(0.5,1) +
  theme_classic() + 
  theme(panel.background = element_rect("gray90"),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Same organs")

p3 <- correlation_to_same_tissue %>%
  ggplot(aes(correlation)) +
  geom_histogram(bins = 100) + xlim(0.5,1) +
  theme_classic() +
  theme(panel.background = element_rect("gray90"),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()
        ) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Same tissue")

p4 <- correlation_to_same_tissue_by_tissue %>% 
  mutate(tissue_name = factor(tissue_name, correlation_to_same_tissue_by_tissue$tissue_name %>% 
                                unique())) %>% 
  ggplot(aes(x = correlation, y = tissue_name)) +
  geom_boxplot(outlier.size=0.3) + xlim(0.5,1) +
  theme_bw() + 
  xlab ("Spearman's roh") +
  theme(axis.title.y = element_blank())

p1 / p2 / p3 / p4 + plot_layout(heights = c(1, 1, 1, 15))

ggsave("./final_plots/data_presentation/spearman_corr_sample_vs_tissue_organ.pdf", height = 14, width = 4)
```

#Specificity and distribution annotation
##Formulas
```{r}
#calculate_tau_score and hpa_gene_classification taken from Max Karlsson
calculate_tau_score <- 
  function(wide_data) {
    max_exp <- 
      apply(wide_data,
            MARGIN = 1,
            function(x) max(x, na.rm = T))
    
    N <- 
      apply(wide_data,
            MARGIN = 1,
            function(x) length(which(!is.na(x))))
    
    expression_sum <- 
      wide_data %>% 
      sweep(MARGIN = 1, 
            STATS = max_exp, 
            FUN = `/`) %>% 
      {1 - .} %>% 
      apply(MARGIN = 1,
            function(x) sum(x, na.rm = T))
    
    
    tau_score <- 
      (expression_sum / (N - 1)) %>% 
      enframe("gene", "tau_score")
    
    tau_score
  }

hpa_gene_classification <- 
  #feed in long data
  function(data, expression_col, tissue_col, gene_col, enr_fold, max_group_n, det_lim = 1) {
    data_ <- 
      data %>% 
      select(gene = gene_col,
             expression = expression_col,
             tissue = tissue_col) %>% 
      mutate(expression = round(expression, 4)) 
    
    if(any(is.na(data_$expression))) stop("NAs in expression column")
    if(any(is.na(data_$gene))) stop("NAs in gene column")
    if(any(is.na(data_$tissue))) stop("NAs in tissue column")
    
    n_groups <- length(unique(data_$tissue))
    
    gene_class_info <- 
      data_ %>%
      group_by(gene) %>%
      summarise(
        
        # Gene expression distribution metrics
        mean_exp = mean(expression, na.rm = T),
        min_exp = min(expression, na.rm = T),
        max_exp = max(expression, na.rm = T), 
        max_2nd = sort(expression)[length(expression)-1],
        
        # Expression frequency metrics
        n_exp = length(which(expression >= det_lim)),
        frac_exp = n_exp/length(expression[!is.na(expression)])*100,
        
        # Limit of enhancement metrics
        lim = max_exp/enr_fold, 
        
        exps_over_lim = list(expression[which(expression >= lim & expression >= det_lim)]),
        n_over = length(exps_over_lim[[1]]), 
        mean_over = mean(exps_over_lim[[1]]),
        min_over = ifelse(n_over == 0, NA,
                          min(exps_over_lim[[1]])),
        
        max_under_lim = max(expression[which(expression < min_over)], det_lim*0.1),
        
        
        exps_enhanced = list(which(expression/mean_exp >= enr_fold & expression >= det_lim)),
        
        
        
        
        # Expression patterns
        enrichment_group = paste(sort(tissue[which(expression >= lim & expression >= det_lim)]), collapse=";"),
        
        n_enriched = length(tissue[which(expression >= lim & expression >= det_lim)]),
        n_enhanced = length(exps_enhanced[[1]]), 
        enhanced_in = paste(sort(tissue[exps_enhanced[[1]]]), collapse=";"),
        n_na = n_groups - length(expression),
        max_2nd_or_lim = max(max_2nd, det_lim*0.1),
        tissues_not_detected = paste(sort(tissue[which(expression < det_lim)]), collapse=";"),
        tissues_detected = paste(sort(tissue[which(expression >= det_lim)]), collapse=";")) 
    
    
    gene_categories <- 
      gene_class_info %>%
      
      mutate(
        spec_category = case_when(n_exp == 0 ~ "not detected", 
                                  
                                  # Genes with expression fold times more than anything else are tissue enriched
                                  max_exp/max_2nd_or_lim >= enr_fold ~ "tissue enriched", 
                                  
                                  # Genes with expression fold times more than other tissues in groups of max group_n - 1 are group enriched
                                  max_exp >= lim &
                                    n_over <= max_group_n & n_over > 1 &
                                    mean_over/max_under_lim >= enr_fold ~ "group enriched", 
                                  
                                  # Genes with expression in tissues fold times more than the mean are tissue enhance
                                  n_enhanced > 0 ~ "tissue enhanced", 
                                  
                                  # Genes expressed with low tissue specificity
                                  T ~ "low tissue specificity"), 
        
        
        dist_category = case_when(frac_exp == 100 ~ "detected in all",
                                  frac_exp >= 31 ~ "detected in many",
                                  n_exp > 1 ~ "detected in some",
                                  n_exp == 1 ~ "detected in single",
                                  n_exp == 0 ~ "not detected"),
        
        spec_score = case_when(spec_category == "tissue enriched" ~ max_exp/max_2nd_or_lim,
                               spec_category == "group enriched" ~ mean_over/max_under_lim, 
                               spec_category == "tissue enhanced" ~ max_exp/mean_exp)) 
    
    
    
    
    ##### Rename and format
    gene_categories %>%
      mutate(enriched_tissues = case_when(spec_category %in% c("tissue enriched", "group enriched") ~ enrichment_group,
                                          spec_category == "tissue enhanced" ~ enhanced_in),
             n_enriched = case_when(spec_category %in% c("tissue enriched", "group enriched") ~ n_enriched,
                                    spec_category == "tissue enhanced" ~ n_enhanced)) %>%
      select(gene, 
             spec_category, 
             dist_category, 
             spec_score,
             n_expressed = n_exp, 
             fraction_expressed = frac_exp,
             max_exp = max_exp,
             enriched_tissues,
             n_enriched,
             n_na = n_na,
             tissues_not_detected,
             tissues_detected) 
    
  }	

```

##Annotation
```{r}

# Specificity classification at consensus level
classification_consensus <- hpa_gene_classification(data = tmm_consensus_tissue, expression_col = "tmm", tissue_col = "consensus_tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

consensus_tau <- calculate_tau_score(tmm_consensus_tissue  %>% 
                                     spread(consensus_tissue_name, tmm)%>%  
                                       mutate_if(is.numeric, function(x){log10(x+1)})%>% 
                                       column_to_rownames("target_id")) 
#category not detected has a very noisy tau, so no tau score for those
classification_consensus <- classification_consensus %>% 
  left_join(consensus_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

#at region level
classification_region <- hpa_gene_classification(data = tmm_region_tissue, expression_col = "tmm", tissue_col = "region_tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

region_tau <- calculate_tau_score(tmm_region_tissue  %>% 
                                    spread(region_tissue_name, tmm) %>%  
                                    mutate_if(is.numeric, function(x){log10(x+1)}) %>% 
                                    column_to_rownames("target_id") )

classification_region <- classification_region %>% 
  left_join(region_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

#at tissue level
classification_tissue <- hpa_gene_classification(data = tmm_tissue, expression_col = "tmm", tissue_col = "tissue_name", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1)

tissue_tau <- calculate_tau_score(tmm_tissue %>% 
                                    spread(tissue_name, tmm) %>%  
                                    mutate_if(is.numeric, function(x){log10(x+1)})%>%
                                    column_to_rownames("target_id") )

classification_tissue <- classification_tissue %>% 
  left_join(tissue_tau, by = c("gene" = "gene")) %>% 
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score))

```

#Figure 3
##Figure 3A - Pie charts at tissue type level
```{r}
gene_category_pal <- c("Detected in all" = "#253494",
                       "Detected in many" = "#2c7fb8",
                       "Detected in some" = "#41b6c4",
                       "Detected in single" = "#a1dab4",
                       "Not detected " = "	#bebebe",
                       "Low tissue specificity" = "grey40",
                       "Tissue enhanced" = "#984ea3",
                       "Group enriched" = "#FF9D00",
                       "Tissue enriched" = "#e41a1c")

plot_data <-
  classification_tissue %>% 
  rename(specificity = spec_category, distribution = dist_category) %>% 
  select(gene, specificity, distribution) %>% 
  gather(class_type, class, -1) %>% 
  group_by(class_type, class) %>% 
  count() %>%
  group_by(class_type) %>% 
  mutate(perc = 100 * n / sum(n),
         label = paste0(n, "\n", round(perc, 1), "%")) %>% 
  ungroup() %>% 
  mutate(class = factor(str_to_sentence(class), str_to_sentence(c("tissue enriched", "group enriched",  "tissue enhanced", "low tissue specificity", "detected in all", "detected in many", "detected in some", "detected in single", "not detected"))),
         class_type = factor(str_to_sentence(class_type),
                             c("Specificity", "Distribution")))
    
plot_dodge = 0.1
plot_data %>% 
  arrange(match(class, rev(levels(class)))) %>% 
  group_by(class_type) %>% 
  mutate(y_stack = cumsum(n) - n/2) %>% 
  {ggplot(., aes(1, n, fill = class, group = class, 
                 label = label)) +
      geom_col(show.legend = F, 
               color = "white", 
               width = 1) + 
      geom_segment(aes(x = 1.5 + plot_dodge, xend = 1.5, 
                       y = y_stack, yend = y_stack), size = 0.5) +
      geom_text_repel(aes(x = 1.5 + plot_dodge, y = y_stack), 
                      color = "black", nudge_x = plot_dodge, 
                      segment.size = 0.5, size = 24/11) +
      scale_fill_manual(values = gene_category_pal) + 
      facet_wrap(~class_type) +
      coord_polar("y",start = 0) +
      theme_void() + 
      scale_x_continuous(expand = expansion(c(0,0.8)))}


ggsave("./final_plots/classification/class_pies_tissue_type.pdf",width = 6, height = 5)

```
##Figure 3B - Distibution for each tissue type
```{r}

classification_tissue %>% group_by(dist_category) %>% count()

ordered_names_distr <- classification_tissue %>% 
  filter(dist_category %in% c("detected in single", "detected in some", "detected in many", "detected in all")) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected %>% 
  str_to_sentence()

detection_palette <- c("Detected in all" = "#253494",
                        "Detected in many" = "#2c7fb8",
                        "Detected in some" = "#41b6c4",
                        "Detected in single" = "#a1dab4",
                        "Not detected " = "grey")


classification_tissue %>%
  filter(
    dist_category %in% c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(str_to_sentence( tissues_detected), ordered_names_distr)) %>%
  mutate(dist_category = factor(
    str_to_sentence( dist_category),
    c(
      "Detected in single",
      "Detected in some",
      "Detected in many",
      "Detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Number of detected genes per tissue type")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

 ggsave("./final_plots/classification/tissue_detection_a.pdf", height = 11.5, width = 4.5)


```

##Figure 3C - Distribution and Tau score
```{r}
detection_palette <- c("Detected in all" = "#253494",
                        "Detected in many" = "#2c7fb8",
                        "Detected in some" = "#41b6c4",
                        "Detected in single" = "#a1dab4",
                        "Not detected " = "grey")

p1 <- classification_tissue %>% 
  mutate(dist_category = factor(str_to_sentence( dist_category), levels = rev(names(detection_palette))),
         enriched_tissues = str_to_sentence(enriched_tissues)) %>% 
  filter(dist_category != "Not detected") %>% 
  ggplot(aes(x = tau_score, y = dist_category, fill = dist_category)) +
  geom_violin() +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  xlab("Tau score") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank()
    ) 

#ggsave("./final_plots/classification/tau_to_distribution.pdf",width = 5.5, height = 4)


p2 <- classification_tissue %>%
  ggplot(aes(tau_score)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  ylab("Count")+
  theme(panel.background = element_rect("gray90"),
        #axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0))

p2/ p1 + plot_layout(heights = c(1, 3))

ggsave("./final_plots/classification/distribution_and_dendrogram_tissue_w_overall.pdf",width = 6.5, height = 6)
```
###Extra Figures not in report
```{r}
detection_palette <- c("detected in all" = "#253494",
                        "detected in many" = "#2c7fb8",
                        "detected in some" = "#41b6c4",
                        "detected in single" = "#a1dab4",
                        "not detected " = "grey")

ordered_names_distr <- classification_tissue %>%  
  filter(dist_category %in% c("detected in single"#,
                             # "detected in some"
                              )) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected

classification_tissue %>%  
  filter(
    dist_category %in% c(
      "detected in single"#,
#
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(tissues_detected, ordered_names_distr)) %>%
  mutate(dist_category = factor(
    dist_category,
    c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
#  ggtitle("Number of detected genes per tissue type")  +
  ggtitle("Number of detected genes detected in a single tissue")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

```
```{r}
detection_palette <- c("detected in all" = "#253494",
                        "detected in many" = "#2c7fb8",
                        "detected in some" = "#41b6c4",
                        "detected in single" = "#a1dab4",
                        "not detected " = "grey")

ordered_names_distr <- classification_tissue %>%  
  filter(dist_category %in% c("detected in single",
                              "detected in some"
                              )) %>% 
  separate_rows(tissues_detected, sep = ";") %>% 
  group_by(dist_category, tissues_detected) %>% 
  count() %>% 
  group_by(tissues_detected) %>% 
  summarise(sum = sum(n)) %>% 
  arrange(sum) %>% 
  .$tissues_detected

classification_tissue %>%  
  filter(
    dist_category %in% c(
      "detected in single",
      "detected in some"
    )
  ) %>%
  separate_rows(tissues_detected, sep = ";") %>%
  group_by(dist_category, tissues_detected) %>%
  count() %>%
  mutate(tissues_detected = factor(tissues_detected, ordered_names_distr)) %>%
  mutate(dist_category = factor(
    dist_category,
    c(
      "detected in single",
      "detected in some",
      "detected in many",
      "detected in all"
    )
  )) %>%
  ggplot(aes(x = n, y = tissues_detected, fill = dist_category)) +
  geom_col() +
  scale_fill_manual(values = detection_palette) +
  theme_classic() + theme(
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Number of detected genes per tissue type")  +
 # ggtitle("Number of detected genes detected in a single tissue")  +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE))

```

#Figure 4
##Figure 4A - Pie charts at grouped tissue level
```{r}
gene_category_pal <- c("Detected in all" = "#253494",
                       "Detected in many" = "#2c7fb8",
                       "Detected in some" = "#41b6c4",
                       "Detected in single" = "#a1dab4",
                       "Not detected " = "	#bebebe",
                       "Low tissue specificity" = "grey40",
                       "Tissue enhanced" = "#984ea3",
                       "Group enriched" = "#FF9D00",
                       "Tissue enriched" = "#e41a1c")

plot_data <-
  classification_consensus %>% 
  rename(specificity = spec_category, distribution = dist_category) %>% 
  select(gene, specificity, distribution) %>% 
  gather(class_type, class, -1) %>% 
  group_by(class_type, class) %>% 
  count() %>%
  group_by(class_type) %>% 
  mutate(perc = 100 * n / sum(n),
         label = paste0(n, "\n", round(perc, 1), "%")) %>% 
  ungroup() %>% 
  mutate(class = factor(str_to_sentence(class), str_to_sentence(c("tissue enriched", "group enriched",  "tissue enhanced", "low tissue specificity", "detected in all", "detected in many", "detected in some", "detected in single", "not detected"))),
         class_type = factor(str_to_sentence(class_type),
                             c("Specificity", "Distribution")))
    
plot_dodge = 0.1
plot_data %>% 
  arrange(match(class, rev(levels(class)))) %>% 
  group_by(class_type) %>% 
  mutate(y_stack = cumsum(n) - n/2) %>% 
  {ggplot(., aes(1, n, fill = class, group = class, 
                 label = label)) +
      geom_col(show.legend = F, 
               color = "white", 
               width = 1) + 
      geom_segment(aes(x = 1.5 + plot_dodge, xend = 1.5, 
                       y = y_stack, yend = y_stack), size = 0.5) +
      geom_text_repel(aes(x = 1.5 + plot_dodge, y = y_stack), 
                      color = "black", nudge_x = plot_dodge, 
                      segment.size = 0.5, size = 24/11) +
      scale_fill_manual(values = gene_category_pal) + 
      facet_wrap(~class_type) +
      coord_polar("y",start = 0) +
      theme_void() + 
      scale_x_continuous(expand = expansion(c(0,0.8)))}


ggsave("./final_plots/classification/class_pies_grouped_tissue.pdf",width = 6, height = 5)
```

##Figure 4B - Speceficity for each grouped tissue
```{r}
classification_consensus %>% group_by(spec_category) %>% count() 

ordered_names_sp <-
  classification_consensus %>%
  filter(spec_category %in% c("group enriched", "tissue enriched", "tissue enhanced")) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  group_by(spec_category, enriched_tissues) %>%
  count() %>%
  ungroup() %>% 
  group_by(enriched_tissues) %>%
  summarise(sum = sum(n)) %>%
  ungroup() %>% 
  arrange(sum) %>%
  .$enriched_tissues %>% 
  str_to_sentence()
  
  
  

specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

classification_consensus %>%
  filter(spec_category %in% c("group enriched", "tissue enriched", "tissue enhanced")) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  group_by(spec_category, enriched_tissues) %>%
  count() %>%
  mutate(enriched_tissues = factor(str_to_sentence(enriched_tissues), ordered_names_sp)) %>%
  mutate(spec_category = factor(str_to_sentence(spec_category), c("Tissue enriched", "Group enriched",  "Tissue enhanced"))) %>%
  ggplot(aes(x = n, y = enriched_tissues, fill = spec_category)) +
  geom_col() +
  scale_fill_manual(values = specificity_palette) +
  xlab("Number of genes")+
  theme_classic() + theme(
    axis.title.y = element_blank(),
  #  axis.title.x = element_blank(),
    axis.line.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank()
  ) +
  scale_x_continuous(expand = expansion(mult = 0, add = 0)) +
  ggtitle("Specificity category per consensus tissue") 

ggsave("./final_plots/classification/consensus_tissue_specificity.pdf", height = 7, width = 5.5)
```


##Figure 4C - Specificity and Tau score
```{r}


specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

p1 <- classification_consensus %>% 
  mutate(spec_category = factor(str_to_sentence( spec_category), levels = names(specificity_palette)),
         enriched_tissues = str_to_sentence(enriched_tissues)) %>% 
  filter(spec_category != "Not detected") %>% 
  ggplot(aes(x = tau_score, y = spec_category, fill = spec_category)) +
  geom_violin() +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  xlab("Tau score") +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    axis.title.y = element_blank(), 
    axis.text.y = element_blank(), 
    axis.ticks.y = element_blank()
    ) 

#ggsave("./final_plots/classification/tau_to_specificity.pdf",width = 5.5, height = 4)


p2 <- classification_consensus %>%
  ggplot(aes(tau_score)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  ylab("Count")+
  theme(panel.background = element_rect("gray90"),
        #axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.title.x = element_blank()) +
  scale_y_continuous(expand = expansion(mult = 0, add = 0))

p2/ p1 + plot_layout(heights = c(1, 3))

ggsave("./final_plots/classification/specificity_and_dendrogram_consensus_w_overall.pdf",width = 7, height = 6)

```

##Figure 4D - Gene annotation alluvial plot
```{r}
width = 0.1

alluv_1 <-
  
  classification_consensus %>%
  mutate(Specificity = str_to_sentence(spec_category),
          Distribution = str_to_sentence(dist_category)) %>%
  select(Specificity,
         Distribution) %>%
  mutate(row_n = row_number()) %>%
  gather(bar, chunk, -row_n) %>%
  mutate(color_vars = 1) %>%
  group_by(row_n) %>%
  mutate(chunk_color = chunk[match(c("Specificity",
                                     "Distribution")[color_vars], bar)]) %>% 
  ungroup() %>%
  
  
  mutate(chunk = factor(chunk, levels = c('Tissue enriched', 'Group enriched', 
                                          'Tissue enhanced', 'Low tissue specificity', 
                                          'Detected in single',
                                          'Detected in some', 
                                          'Detected in many', 
                                          'Detected in all',
                                          'Not detected')),
         bar = factor(bar, levels = c("Specificity",
                                      "Distribution"))) %>%
  
  
  ggplot(aes(x = bar, stratum = chunk, alluvium = row_n,
             y = 1)) +
  
  geom_flow(aes(fill = chunk_color), 
            show.legend = F, width = width,
            knot.pos = 1/6) +
  geom_stratum(aes(fill = chunk), 
               show.legend = F, color = NA, width = width) +
  
  scale_x_discrete(expand = c(.1, .1), position = "top") +
  scale_fill_manual(values = c(gene_category_pal)) + 
  
  
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        axis.text.x = element_blank(), 
        axis.ticks = element_blank(), 
        panel.background = element_blank(), 
        axis.title = element_blank()) +
  coord_flip()

# alluv_1

flow_data <-
  ggplot_build(alluv_1)$data[[1]] %>%
  as_tibble() %>%
  {
    if ("side" %in% names(.)) {
      .
    } else{
      mutate(.,
             side = case_when(flow == "from" ~ "start",
                              flow == "to" ~ "end"))
    }
  }



stratum_data <- 
  ggplot_build(alluv_1)$data[[2]]

flow_data_labels <-
  flow_data %>%
  as_tibble() %>%
  select(x, stratum, group, side, ymin, ymax) %>%
  pivot_wider(names_from = side,
              values_from = c(x, stratum, ymin, ymax)) %>%
  mutate_at(
    c(
      "x_end",
      "ymax_end",
      "ymin_end",
      "x_start",
      "ymax_start",
      "ymin_start"
    ),
    as.numeric
  ) %>%
  group_by(stratum_start, stratum_end, x_start, x_end) %>%
  summarise(
    y_end = (min(ymin_end) + max(ymax_end)) / 2,
    y_start = (min(ymin_start) + max(ymax_start)) / 2,
    size = max(ymax_start) - min(ymin_start)
  )

alluv_2 <- 
  alluv_1 +
  geom_text(data = flow_data_labels,
            aes(x = x_start + width/2,
                y = y_start, 
                label = size), 
            inherit.aes = F, 
            size = 3,
            angle = -90,
            hjust = 1,
            vjust = 0.5) +
  geom_text(data = flow_data_labels,
            aes(x = x_end - width/2,
                y = y_end, 
                label = size), 
            inherit.aes = F, 
            size = 3, 
            angle = -90,
            hjust = 0,
            vjust = 0.5) +
  
  # Stratum label
  
  geom_text(data = stratum_data %>%
              filter(x == 1),
            aes(x = x - width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = 1.5,
            inherit.aes = F) + 
  geom_text(data = stratum_data %>%
              filter(x == 2),
            aes(x = x + width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = -0.5,
            inherit.aes = F) + 
  
  geom_text(data = stratum_data,
            aes(x = x, 
                y = y,
                label = ymax - ymin), 
            size = 4, 
            fontface = "bold",
            color = "white",
            inherit.aes = F)


alluv_2
ggsave("./final_plots/classification/alluvial_classification.pdf",width = 8, height = 3)

```
###Extra figures not in report
```{r}
organ_colors <- metadata %>% select(consensus_tissue_name, consensus_tissue_color) %>% unique()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, str_to_sentence(organ_colors$consensus_tissue_name))


specificity_palette <- rev(c("Not detected" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))


class_table_temp <-
  classification_consensus %>%
  select(gene, spec_category, enriched_tissues) %>%
  separate_rows(enriched_tissues, sep = ";") %>%
  mutate(spec_category = factor(str_to_sentence( spec_category), levels = names(specificity_palette)),
         enriched_tissues = str_to_sentence(enriched_tissues))

plot_dendro <-
  tmm_consensus_tissue %>% 
  select(target_id, consensus_tissue_name, tmm) %>% 
  mutate(consensus_tissue_name = str_to_sentence(consensus_tissue_name)) %>%
  spread(target_id, tmm) %>%
  column_to_rownames("consensus_tissue_name")  %>%
  t() %>%
  cor(method = "spearman") %>%
  
  {1 - .} %>%
  as.dist() %>% 
  hclust(method = "average") %T>%
  plot %>%
  dendro_data()
  
  
dendro_plot_data <- 
  left_join(plot_dendro$segments, 
            plot_dendro$labels, 
            by = c("x" = "x", "yend" = "y")) 

left_plot <- 
  dendro_plot_data %>%
  ggplot() +
  geom_segment(aes(x=y, y=x, xend=yend, yend=xend, group = label))+
  geom_rect(aes(xmin=0, ymin=x + 0.5, 
                xmax=-0.02, ymax=xend - 0.5, 
                fill = label), 
            show.legend = F) +
  scale_color_manual(values = pal)+
  scale_fill_manual(values = pal)+
  scale_x_reverse(expand = expansion(0), position = "top")+
  scale_y_continuous(expand = expansion()) +
  xlab("1 - Spearman's rho") +
  
  theme(axis.text.y = element_blank(), 
        axis.title.y = element_blank(), 
        axis.ticks.y = element_blank(),
        plot.margin = unit(c(1,1,1,1), units = "mm"), 
        panel.background = element_blank()) 

right_plot <- 
  class_table_temp %>%
  filter(!is.na(enriched_tissues)) %>%
  group_by(enriched_tissues, spec_category) %>%
  summarise(n_genes = n()) %>%
  ungroup() %>%
  mutate(enriched_tissues = factor(enriched_tissues, levels = plot_dendro$labels$label),
         spec_category = factor(spec_category, names(specificity_palette))) %>%
  ggplot(aes(n_genes, enriched_tissues, fill = spec_category)) +
  geom_col(width = 0.8, size = 0.1) +
  theme_bw() + 
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  scale_fill_manual(values = gene_category_pal, name = "Specificity") +
  # coord_flip() +
  xlab("Number of genes") +
  
  scale_x_continuous(position = "top", expand = expansion(c(0, 0.1))) + 
  scale_y_discrete(expand = expansion()) +
  
  theme(axis.text.y = element_text(hjust = 0.5), 
        legend.position = c(0.7, 0.5),
        axis.title.y = element_blank(),
        panel.border = element_blank())

left_plot + right_plot +
  plot_layout(widths = c(0.3, 1))


ggsave("./final_plots/classification/specificity_and_dendrogram_consensus.pdf",width = 7, height = 7)

```
#Figure 5
```{r}
##Plot based on Max Karlsson's network plot

classification_network_plot_2 <- 
  function(class_table, gene_col, spec_col, enriched_col, spec_filter, pal, savename, enriched_sep = ";", 
           node_filter_rank = 2, node_filter_show_cat = c("tissue enriched"), 
           node_filter_min = 2, node_filter_n_show = 8, scale_factor = 1) {
    
    enrichment_table <- 
      class_table %>% 
      select(gene = gene_col, 
             spec = spec_col, 
             enriched = enriched_col) %>% 
      filter(spec %in% spec_filter) %>% 
      group_by(enriched, spec) %>% 
      summarise(n_genes = n()) %>%
      ungroup()
    
    net_data <-
      enrichment_table %>% 
      mutate(all_enriched = enriched) %>%
      separate_rows(enriched, sep = enriched_sep) %>%
      group_by(enriched, spec) %>% 
      mutate(rank = rank(-n_genes, ties.method = "min")) %>%
      group_by(all_enriched) %>%
      mutate(any_low_rank = any(rank <= node_filter_rank)) %>%
      ungroup() %>%
      
      filter((spec == node_filter_show_cat | any_low_rank | n_genes >= node_filter_n_show) &
               (spec == node_filter_show_cat | n_genes >= node_filter_min)) %>% 
      mutate(edge_id = paste("enriched:", all_enriched)) %>% 
      arrange(n_genes)
    
    net_edges <- 
      net_data %$% 
      tibble(node1 = enriched, node2 = edge_id, n = n_genes) %>% 
      unique()
    
    g <-
      net_edges %>%
      graph_from_data_frame(directed = FALSE) %>%
      ggraph(layout = "kk") 
    
    link_map <- 
      net_edges %>% 
      gather(node, id, -(3)) %>% 
      mutate(tissue_node = node == "node1", 
             color_id = case_when(tissue_node ~ id, 
                                  grepl(";", id) ~ "Group enriched",
                                  !grepl(";", id) ~ "Tissue enriched"), 
             label = ifelse(tissue_node, color_id, n)) %>%
      select(n, node, id, tissue_node, color_id, label) %>%
      unique()
    
    
    edge_data <- get_edges()(g$data)
    node_data <- 
      get_nodes()(g$data) %>% 
      as_tibble() %>%
      left_join(link_map, 
                by = c("name" = "id")) 
    
    
    fig <- 
      g + 
      geom_edge_arc(aes(width = n), 
                    color = "gray", 
                    strength = 0, 
                    alpha = 0.5,
                    show.legend = F) + 
      scale_edge_alpha_continuous(range = c(0.3, 1)) +
      scale_edge_width_continuous(range = c(1, 3)) +
      
      geom_node_point(data = node_data  %>%
                        filter(!tissue_node),
                      aes(size = log10(n), 
                          fill = color_id), 
                      stroke = 1,
                      # size = 10,
                      shape = 21,
                      show.legend = F)+
      geom_node_point(data = node_data %>%
                        filter(tissue_node),
                      aes(fill = color_id), 
                      stroke = 1,
                      size = 20 * scale_factor,
                      shape = 21,
                      show.legend = F)+
      geom_node_text(data = node_data,
                     aes(label = label),
                     size = 4 * scale_factor) +
      scale_size_continuous(range = c(5, 10) * scale_factor) +
      scale_fill_manual(values = pal) +
      
      theme_void()
    
    ## ----- Save
    
    
    cyto_summary <- 
      net_edges %>% 
      mutate(category = ifelse(!grepl(enriched_sep, node2), "Tissue enriched", "Group enriched"), 
             node_id = unclass(factor(node2)),
             node1 = str_to_sentence(node1), 
             n_sqrt = sqrt(n), 
             str_len = str_length(node1)) %>%
      select(category, node1, node2, node_id, n, n_sqrt, str_len) 
    
    cyto_summary %>% 
      write_delim("cytoscape nodes summary.txt", delim = "\t")
  #    write_delim(savepath(paste(savename, "cytoscape nodes summary.txt")), delim = "\t")
    
    bind_rows(cyto_summary %>% 
                left_join(pal %>% 
                            enframe("node1", "color")) %>% 
                select(node_id = node1, 
                       color) %>% 
                unique() %>%
                mutate(node_type = "Tissue"),
              cyto_summary %>% 
                mutate(color = case_when(category == "Tissue enriched" ~ "#e41a1c",
                                         category == "Group enriched" ~ "#FF9D00"), 
                       node_id = as.character(node_id)) %>%
                select(node_id, color) %>% 
                unique() %>%
                
                mutate(node_type = "Enrichment")) %>%
      mutate(color2 = case_when(node_type == "Enrichment" ~ color,
                                node_type == "Tissue" ~ "#D3D3D3FF"),
             color3 = case_when(node_type == "Enrichment" ~ color,
                                node_type == "Tissue" ~ "#BEBEBEFF")) %>% 
      
      write_delim("cytoscape nodes color.txt", delim = "\t") 
      #write_delim(savepath(paste(savename, "cytoscape nodes color.txt")), delim = "\t") 
    
    bind_rows(cyto_summary %>% 
                select(node_id = node1) %>% 
                mutate(label = node_id) %>%
                unique(),
              cyto_summary %>% 
                mutate(node_id = as.character(node_id), 
                       label = as.character(n)) %>%
                select(node_id, label) %>% 
                unique()) %>% 
      write_delim("cytoscape nodes label whole group.txt",
      #write_delim(savepath(paste(savename, "cytoscape nodes label whole group.txt")), 
                  delim = "\t")
    
    ## ----
    
    fig
  }
```

```{r}


organ_colors <- metadata %>% select(consensus_tissue_name, organ_color) %>% unique()
pal1 <-  organ_colors$organ_color
pal1 <- setNames(pal1, organ_colors$consensus_tissue_name)

specificity_palette <- rev(c("Not detected" = "grey",
                             "Tissue" = "grey",
                           "Low tissue specificity" = "grey40",
                           "Tissue enhanced" = "#984ea3",
                           "Group enriched" = "#FF9D00",
                           "Tissue enriched" = "#e41a1c"))

library(influential)

classification_network_plot_2(
  class_table = classification_consensus %>% mutate(spec_category = str_to_sentence(spec_category)),
  gene_col = "gene",
  spec_col = "spec_category",
  enriched_col = "enriched_tissues",
  spec_filter = c("Tissue enriched", "Group enriched"),
  pal = c(pal1, specificity_palette),
  savename = "test_interconsensus",
  enriched_sep = ";",
  node_filter_rank = 2,
  node_filter_show_cat = c("Tissue enriched"),
  node_filter_min = 2,
  node_filter_n_show = 5,
  scale_factor = 0.8
) 


ggsave("./final_plots/data_presentation/nework_plot2-filter.pdf", height = 20, width = 20)
```

#Read Comparison data
```{r}
comp_metadata <- read_csv("./data/final_data/comparison_metadata-init-1-0.csv")

human_atlas_tissue <-
  read_delim("./data/human_hpa/rna_tissue_consensus.tsv", delim = "\t") %>%
  mutate(source = "hpa_consensus") %>%
  group_by(Gene) %>%
  mutate(nx = case_when(nTPM == 0 ~ 0,
                        T ~ nTPM / sqrt(sd(nTPM)))) %>%
  ungroup() #%>% 
  # # just necessary if read data of multiple different sources, then would make sense to average out samples named the same
  # # just from one source atm, so no need for this.
  # group_by(Tissue, Gene) %>%
  # summarise(nTPM = mean(nTPM, na.rm = T)) %>% 
  # ungroup()

hpa_comp <- human_atlas_tissue %>% left_join(
  comp_metadata %>%
    select(hpa_consensus_name, comparison_tissue_name) %>%
    distinct() %>%
    filter(!is.na(hpa_consensus_name)) %>%
    filter(!is.na(comparison_tissue_name)),
  by = c("Tissue" = "hpa_consensus_name")) %>% 
  filter(!is.na(comparison_tissue_name)) %>% 
  group_by(Gene,comparison_tissue_name) %>% 
  summarise(tmm = max(nTPM),
            nx = max(nx)) %>% 
  ungroup() %>% 
  rename(tissue = comparison_tissue_name)

rat_tissue <-
  read_csv("./data/final_data/final_tmm_tissue_name.csv") %>% 
  # #omit gather, data should be  already in long format
  # gather(sample, tmm,-1) %>%
  group_by(target_id) %>%
  mutate(nx = case_when(tmm == 0 ~ 0,
                        T ~ tmm / sqrt(sd(tmm)))) %>%
  ungroup()
  

# select tissues and pool tissues that will get compared
rat_tissue_comp <- rat_tissue %>% left_join(comp_metadata %>% 
                                              select(tissue_name, comparison_tissue_name), by = c("tissue_name" = "tissue_name")) %>%
  filter(!is.na(comparison_tissue_name)) %>% 
  group_by(target_id, comparison_tissue_name) %>% 
  summarise(tmm = max(tmm),
            nx = max(nx)) %>% 
  ungroup() %>% 
  rename(tissue = comparison_tissue_name)

#check if all have the same tissue
all(rat_tissue_comp %>% distinct(tissue) == hpa_comp %>% distinct(tissue) )

```
#Read ortholog data
```{r}
#Select either "HPA" or "ensemble"
ortholog_data_from <- "HPA"
#ortholog_data_from <- "ensemble"

include_many2many <- TRUE
include_one2many <- TRUE

#Only needed for ensembl:
#define ensemble version
version     <- 103
organism_db <- "rnorvegicus_gene_ensembl"
#mart <- useEnsembl ( biomart="genes" , dataset=organism_db , version=version )

if (ortholog_data_from == "HPA"){
  homolog_data <- read.table("./data/human_hpa/kalle/ensembl_rat_ortholog.tsv", sep = "\t", header = TRUE) %>% 
    filter(ensrnog_id %in% rat_tissue$target_id) %>% 
    filter(ensg_id %in% human_atlas_tissue$Gene)
} else if (ortholog_data_from == "ensemble"){
  homolog_data <- getBM( attributes=c( "ensembl_gene_id", "hsapiens_homolog_ensembl_gene", "hsapiens_homolog_orthology_type") , mart=mart ) %>%
    filter(hsapiens_homolog_ensembl_gene != "") %>% 
    filter(ensembl_gene_id %in% rat_tissue$target_id) %>% 
    filter(hsapiens_homolog_ensembl_gene %in% human_atlas_tissue$Gene) %>% 
    rename(ensrnog_id = ensembl_gene_id, ensg_id = hsapiens_homolog_ensembl_gene, ortholog_type = hsapiens_homolog_orthology_type)
}

if (include_many2many == FALSE){
  homolog_data <- homolog_data %>% filter(!ortholog_type == "ortholog_many2many")
}
if (include_one2many == FALSE){
  homolog_data <- homolog_data %>% filter(!ortholog_type == "ortholog_one2many")
}

```


#Figure 6
```{r}
comparison_colors_tbl <- rbind(
  comp_metadata %>% select(name = comparison_tissue_name, color = consensus_tissue_color) %>% distinct(), 
  comp_metadata %>% select(name = tissue_name, color = tissue_color) %>%  distinct()
  ) %>% 
  distinct() %>% 
  drop_na() %>% 
  mutate(name = str_to_sentence(name))

pal <- comparison_colors_tbl$color
pal <- setNames(pal, str_to_sentence(comparison_colors_tbl$name))


plot_data1 <- 
  comp_metadata %>% 
  select(tissue_name, comparison_tissue_name, organ_name) %>% 
  unique() %>% 
  drop_na() %>% 
  mutate(tissue_name = str_to_sentence(tissue_name), comparison_tissue_name = str_to_sentence(comparison_tissue_name)) %>% 
  arrange(organ_name, comparison_tissue_name) %>% 
  mutate(plot_order = row_number()) %>% select(-organ_name)

plot_data2 <- 
  plot_data1 %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("tissue_name", 
                           "comparison_tissue_name"
                           )))
plot_data3 <- 
  plot_data1 %>% 
  group_by(comparison_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(comparison_colors_tbl,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)

ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "comparison_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 2, xmax = 2.5, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 0.5, xmax = 1, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "comparison_tissue_name",
      xend = "tissue_name",
      y = left_pos,
      yend = plot_order,
      color = tissue_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "comparison_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  # geom_label(
  #   data = plot_data2 %>%
  #     filter(column == "organ_name"),
  #   #aes(x = column, y = plot_order, label = label),
  #   aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
  #   show.legend = F,
  #   label.size = 0,
  #   hjust = 1,
  #   lineheight = 0.7,
  #   label.padding = unit(0, "mm"),
  #   size = 2 * 5 / 6
  # ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c("Rat tissue", "Comparison tissue"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
ggsave("final_plots/comparison/comparison_tissues_rat.pdf", height = 5, width = 4)
```
```{r}
comparison_colors_tbl <- rbind(
  comp_metadata %>% select(name = comparison_tissue_name, color = consensus_tissue_color) %>% distinct(), 
  comp_metadata %>% select(name = hpa_consensus_name, color = tissue_color) %>%  distinct()
  ) %>% 
  distinct() %>% 
  drop_na() %>% 
  mutate(name = str_to_sentence(name))

pal <- comparison_colors_tbl$color
pal <- setNames(pal, str_to_sentence(comparison_colors_tbl$name))


plot_data1 <- 
  comp_metadata %>% 
  select(hpa_consensus_name, comparison_tissue_name, organ_name) %>% 
  unique() %>% 
  drop_na() %>% 
  mutate(hpa_consensus_name = str_to_sentence(hpa_consensus_name), comparison_tissue_name = str_to_sentence(comparison_tissue_name)) %>% 
  arrange(organ_name, comparison_tissue_name) %>% 
  mutate(plot_order = row_number()) %>% select(-organ_name)

plot_data2 <- 
  plot_data1 %>%
  gather(column, label, -plot_order) %>%
  group_by(label, column) %>% 
  summarise(plot_order = mean(plot_order))  %>%
  ungroup() %>% 
  mutate(label = label,
         column = factor(column,
                         c("comparison_tissue_name", "hpa_consensus_name"
                           )))
plot_data3 <- 
  plot_data1 %>% 
  group_by(comparison_tissue_name) %>% 
  mutate(left_pos = mean(plot_order))

plot_data4 <- 
  plot_data2 %>% 
  left_join(comparison_colors_tbl,
            by = c("label" = "name")) %>% 
  group_by(column, label) %>% 
  summarise(miny = min(plot_order) - 0.5,
            maxy = max(plot_order) + 0.5)

ggplot() +
  geom_rect(data = plot_data4, 
           aes(xmin = column, xmax = column, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "hpa_consensus_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 2, xmax = 2.5, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F
         ) +
    geom_rect(data = plot_data4 %>% filter (column == "comparison_tissue_name"), 
          # aes(xmin = column, xmax = column, ymin = miny, ymax = maxy, color = color), 
           aes(xmin = 0.5, xmax = 1, ymin = miny, ymax =  maxy, fill = label), 
           show.legend = F, width = 10
         ) +
  geom_segment(
    data = plot_data3,
    aes(
      x = "comparison_tissue_name",
      xend = "hpa_consensus_name",
      y = left_pos,
      yend = plot_order,
      color = hpa_consensus_name
    ),
    show.legend = F,
    alpha = 0.5,
    size = 2
  )+
  geom_text(
    data = plot_data2 %>% filter(column == "hpa_consensus_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 0,
    size = 2 * 5 / 6,
    label.padding = unit(0, "mm")
  ) +
  geom_text(
    data = plot_data2 %>% filter(column == "comparison_tissue_name"),
    aes(x = column, y = plot_order, label = label),
    hjust = 1,
    size = 2 * 5 / 6,
    show.legend = F,
    label.padding = unit(0, "mm")
  ) +
  # geom_label(
  #   data = plot_data2 %>%
  #     filter(column == "organ_name"),
  #   #aes(x = column, y = plot_order, label = label),
  #   aes(x = column, y = plot_order, label = gsub(" ", "\n", label)),
  #   show.legend = F,
  #   label.size = 0,
  #   hjust = 1,
  #   lineheight = 0.7,
  #   label.padding = unit(0, "mm"),
  #   size = 2 * 5 / 6
  # ) +
  scale_y_reverse() +
  scale_x_discrete(labels  = c("Comparison tissue", "Human Tissue"),position = "top") +
  scale_fill_manual(values = pal) +
  scale_color_manual(values = pal) +
  theme_void() +
  theme(axis.text.x = element_text())
ggsave("final_plots/comparison/comparison_tissues_human_rev.pdf", height = 5, width = 4)

```


#Batch Correction
```{r}

#Do batch correction
#join rat and human tmm data together
joined_atlas_comparison_temp <- rat_tissue_comp %>%
  inner_join(homolog_data, by = c("target_id" = "ensrnog_id")) %>%
  unite(mutual_id, target_id, ensg_id, sep = "_") %>%
  mutate(species = "rat") %>%
  select(mutual_id, tissue, species , tmm) %>%
  bind_rows(
    hpa_comp %>%
      inner_join(homolog_data, by = c("Gene" = "ensg_id")) %>%
      unite(mutual_id, ensrnog_id, Gene, sep = "_") %>%
      mutate(species = "human") %>%
      select(mutual_id, tissue, species , tmm)
  )
  #### if tmm is under 0 then it is equal to 0
  #mutate(tmm = ifelse(tmm < 1, 0, tmm ))

#long table with inf of on tissue and species
joined_atlas_comparison_temp_2 <- joined_atlas_comparison_temp %>% 
  unite(id, tissue, species, sep = "_") %>% 
  separate(id, into = c("tissue", "species"), sep = "_", remove = F) 

#wide table only with tmm
joined_atlas_comparison_temp3_tmm <-
  joined_atlas_comparison_temp_2 %>% 
  select(mutual_id, id, tmm) %>% 
  spread(id, tmm) %>% 
  column_to_rownames("mutual_id") 

#log scale
joined_atlas_comparison_temp3_limma <-
  joined_atlas_comparison_temp3_tmm %>%
  {log10(. + 1)} %>%
  limma::removeBatchEffect(batch = colnames(joined_atlas_comparison_temp3_tmm) %>%
                             str_extract("_.*$"))
#put them together in the long format
joined_atlas_comparison<- 
  joined_atlas_comparison_temp_2 %>%
  left_join(joined_atlas_comparison_temp3_tmm %>% 
              as_tibble(rownames = "mutual_id") %>% 
              gather(id, tmm, -1)) %>% 
  left_join(joined_atlas_comparison_temp3_limma %>% 
              as_tibble(rownames = "mutual_id") %>% 
              gather(id, limma_log1p_tmm, -1)) 
```

#Figure 7
##Figure 7A - Cross species dendrogramm
```{r}

hclust4RNAseq <- function(df, correlation_method = "spearman"){
  #wide dataframe as input 
  #to get correlation between samples, where rows are genes columns are samples
  #to get correlation between genes across samples, input df with genes as columns
  #can use later for dendogram making: ggdendrogram([hclust4RNAseq_results], rotate = FALSE, size = 10, face = "bold")
  similarity <- cor(df, method=correlation_method, use="pairwise.complete.obs")
  dissimilarity <- 1 - similarity
  hcl <- hclust(as.dist(dissimilarity), "average")
  return (hcl)
} 


organ_colors <- comp_metadata %>% select(comparison_tissue_name, consensus_tissue_color) %>% drop_na() %>% distinct()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, str_to_sentence(organ_colors$comparison_tissue_name))

shape_def <- c(21, 22)
shape_def <- setNames(shape_def, c("Human", "Rat"))


plot_comp_dendro_data <- joined_atlas_comparison %>% 
  select(mutual_id, id, limma_log1p_tmm) %>% 
  spread(id, limma_log1p_tmm) %>% 
  column_to_rownames("mutual_id") %>% 
  cor(method = "spearman", use="pairwise.complete.obs") %>%
  {1 - .} %>%
  as.dist() %>% 
  hclust(method = "average") %T>%
  plot %>%
  dendro_data()

dendro_plot_data <- 
  left_join(plot_comp_dendro_data$segments, 
            plot_comp_dendro_data$labels, 
            by = c("x" = "x", "yend" = "y")) %>% 
  separate(label, c("tissue", "species"), sep = "_", remove = FALSE) %>% 
  mutate(tissue = str_to_sentence(tissue), species = str_to_sentence(species)) %>% 
  mutate(species = factor(species, c("Human", "Rat")))

left_plot <- 
  dendro_plot_data %>%
  ggplot() +
  geom_segment(aes(x=y, y=x, xend=yend, yend=xend))+
  # geom_rect(aes(xmin=0, ymin=x + 0.5, 
  #               xmax=-0.02, ymax=xend - 0.5, 
  #               fill = tissue), 
  #           show.legend = F) +
  geom_point(aes(
      x = 0,
      y = x,
      shape = species,
      fill = tissue
    ),
    color = "gray25",
    alpha = 1,
    size = 3,
    stroke = 0.7
  ) +
  geom_text(aes(x=-0.02, y=x,
                label = tissue),
            hjust = 0,
            show.legend = F) +
  scale_shape_manual(values = shape_def ) +
# scale_color_manual(values = pal) +
  scale_fill_manual(values = pal, guide = "none") +
   scale_x_reverse(
     expand = expansion(0.5 ), 
     position = "top")+
   scale_y_continuous(expand = expansion(0.01)) +
  xlab("1 - Spearman's rho") +
  
  theme(axis.text.y = element_blank(), 
        axis.title.y = element_blank(), 
        axis.ticks.y = element_blank(),
        legend.title=element_blank(),
        plot.margin = unit(c(1,1,1,1), units = "mm"), 
        panel.background = element_blank()) 
left_plot
ggsave("./final_plots/comparison/comparison_dendrogram.pdf", width = 6.5, height = 9 )

```
##Figure 7B - Cros species UMAP
```{r}
library(plotly)
library(ggplotify)
library(geomtextpath)

comp_metadata <- read_csv("./data/final_data/comparison_metadata-init-1-0.csv")
umap_meta <- comp_metadata %>% select(consensus_tissue_color, organ_color, comparison_tissue_name) %>% filter(comparison_tissue_name != "" ) %>% distinct()
wide_data <- joined_atlas_comparison %>% 
  select(-tissue, -species, -tmm) %>% 
  spread(id, limma_log1p_tmm)
seed <- 42
filter_zero_sd = F
n_epochs = 1000 
n_neighbors = 15

pca_res <-
  wide_data %>%
  #no log1p because limma value is already calculated from log scale value
  column_to_rownames(colnames(wide_data)[1]) %>%
  scale() %>%
  t() %>%
  pca(nPcs = dim(.)[1])

pc_lim <-
  which(pca_res@R2cum > 0.8)[1]

pc_lim_sd <-
  rev(which(pca_res@sDev > 1))[1]

n_neighbors = 15
set.seed(seed)
umap_res <- pca_res@scores[, 1:pc_lim] %>%
  uwot::umap(n_neighbors = n_neighbors,
             n_epochs = n_epochs) %>%
  as_tibble() %>%
  set_names(paste0("UMAP", 1:ncol(.))) %>%
  mutate(sample = rownames(pca_res@scores)) %>%
  select(sample, everything()) %>%
  separate(
    sample,
    into = c("tissue", "species"),
    sep = "_",
    remove = F
  ) %>%
  left_join(umap_meta, by = c("tissue" = "comparison_tissue_name"))

organ_colors <-
  umap_meta %>% select(comparison_tissue_name, consensus_tissue_color) %>% unique()
pal <-  organ_colors$consensus_tissue_color
pal <- setNames(pal, organ_colors$comparison_tissue_name)

shape_def <- c(21, 22)
shape_def <- setNames(shape_def, c("human", "rat"))


# umap_res %>%  ggplot(aes(UMAP1, UMAP2,  color = organ_name)) +
#    geom_point(alpha = 0.8) + scale_color_manual(values = pal)
p <- umap_res %>%  
  ggplot(aes(UMAP1, UMAP2)) +
  geom_textpath(
    aes(label = str_to_sentence(tissue)),
    hjust = 0.5,
    vjust = -0.3,
    color = "gray20"
  ) +
  geom_point(
    aes(fill = tissue, shape = species),
    color = "gray25",
    alpha = 1,
    size = 2,
    stroke = 0.7
  ) +
  guides(fill = "none") +
  scale_fill_manual(values = pal) +
  scale_shape_manual(values = shape_def) +
  theme_classic() + theme(legend.text = element_text(size = 10),
                          #legend.title =element_blank(), 
                          legend.spacing.y = unit(-0.1, 'cm'),
                          legend.spacing.x = unit(-0.01, 'cm')) +
  guides(shape = guide_legend(ncol = 1, byrow = TRUE, title = "Species"))

p





if (include_many2many == TRUE & include_one2many == TRUE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
} else if (include_many2many == FALSE & include_one2many == TRUE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
} else if (include_many2many == FALSE &
           include_one2many == FALSE) {
  comp_umap_file_name = paste0("./final_plots/comparison/",
                               ortholog_data_from,
                               "_umap.pdf")
}


ggsave(comp_umap_file_name, height = 5.5, width = 6.5)

```

##Figure 7C - Cross species hypergeometric test
```{r}
#Based on Max Karlsson: https://github.com/maxkarlsson/Pig-Atlas/blob/master/scripts/functions_classification.R

library(viridis)
library(ggsci)



class1 <- hpa_gene_classification(data = rat_tissue_comp %>% select(target_id, tissue, tmm), expression_col = "tmm", tissue_col = "tissue", gene_col = "target_id", enr_fold = 4, max_group_n = 5, det_lim = 1) %>% rename(ensrnog_id = gene)
class2 <- hpa_gene_classification(data = hpa_comp %>% select(Gene, tissue, tmm), expression_col = "tmm", tissue_col = "tissue", gene_col = "Gene", enr_fold = 4, max_group_n = 5, det_lim = 1) %>% 
  rename(ensg_id = gene)

gene_col1 <- "ensrnog_id"
gene_col2 <- "ensg_id"
sep <- ";"

class1_long <-
  class1 %>%
  select(gene1 = gene_col1, enriched_tissues) %>%
  mutate(
    enriched_tissues = ifelse(is.na(enriched_tissues),
                              "not enriched",
                              enriched_tissues),
    enriched1 = T
  ) %>%
  separate_rows(enriched_tissues, sep = sep)

class2_long <-
  class2 %>%
  select(gene2 = gene_col2, enriched_tissues) %>%
  mutate(
    enriched_tissues = ifelse(is.na(enriched_tissues),
                              "not enriched",
                              enriched_tissues),
    enriched2 = T
  ) %>%
  separate_rows(enriched_tissues, sep = sep)

tis_ <-
  unique(c(class1_long$enriched_tissues,
           class2_long$enriched_tissues)) %>%
  sort()

gene_orthologs <- homolog_data %>% select(1,2)

overlap_hyper_all <- expand.grid(id1 = tis_,
            id2 = tis_,
            gene = 1:nrow(gene_orthologs)) %>%
  as_tibble() %>%
  left_join(gene_orthologs %>%
              select(gene1 = gene_col1,
                     gene2 = gene_col2) %>%
              mutate(gene = row_number())) %>%
  select(-gene) %>%
  left_join(class1_long,
            by = c("gene1", "id1" = "enriched_tissues")) %>%
  left_join(class2_long,
            by = c("gene2", "id2" = "enriched_tissues")) %>%
  mutate(
    enriched1 = ifelse(is.na(enriched1),
                       F,
                       enriched1),
    enriched2 = ifelse(is.na(enriched2),
                       F,
                       enriched2)
  ) %>%
  group_by(id1, id2, enriched1, enriched2) %>%
  count() %>%
  group_by(id1, id2) %>%
  summarise(
    # q is the number of successes
    q = sum(n[which(enriched1 & enriched2)]),
    
    # k is the number of tries - i.e. the number of genes that are elevated for either species
    k = sum(n[which(enriched1 | enriched2)]),
    
    # m is the number of possible successes - i.e. the number of genes that are elevated for either
    m = min(sum(n[which(enriched1)]),
            sum(n[which(enriched2)])),
    
    # n is the population size - i.e. the number of genes
    n = sum(n) - m
  ) %>% 
  mutate(p_value = phyper(q - 1, m, n, k, lower.tail = F)) %>%
  mutate(p_value = ifelse(p_value == 0, .Machine$double.xmin, p_value),
         adj_pval = p.adjust(p_value, method = "BH")) %>%
  rename(rat_id = id1, 
         human_id = id2)

overlap_hyper_all %>% 
  write_csv("./data/final_data/rat_human_class_hyper.csv")


plot_order <- 
  overlap_hyper_all %>% 
  ungroup() %>%
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  filter(rat_id == human_id) %>%
  arrange(adj_pval) %>%
  pull(rat_id)


stripped_theme <-
  theme(panel.background = element_rect(fill = NA, colour = NA),
        plot.background = element_rect(fill = NA, color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        legend.key = element_rect(colour = NA),
        #legend.position = "bottom",
        #legend.direction = "horizontal",
        legend.key.size= unit(0.3, "cm"),
        legend.title = element_text(face="italic"),
        axis.line = element_line(colour="black",size=0.5))


overlap_hyper_all %>% 
  group_by(rat_id, human_id) %>%
  mutate(capped_p = min(c(-log10(adj_pval), 20))) %>%
  ungroup() %>% 
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  mutate(rat_id = factor(rat_id, plot_order),
         human_id = factor(human_id, plot_order)) %>%
  ggplot(aes(human_id, rat_id, fill = capped_p)) +
  geom_tile() + 
  geom_tile(data = . %>% 
              filter(adj_pval >= 0.05),
            fill = "black") + 
  scale_fill_viridis(option = "D", direction = 1, 
                     name = "Adjusted p-value") + 
  stripped_theme +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.2),
        legend.position = "top") +
  xlab("Human") + 
  ylab("Rat")

#ggsave("./final_plots/comparison/Overlap hyper heatmap elevated.pdf", width = 6, height = 6)  

overlap_hyper_all %>% 
  filter(human_id != "not enriched" & 
           rat_id != "not enriched") %>%
  ungroup() %>%
  mutate(rat_id = str_to_sentence(rat_id),
         human_id = str_to_sentence(human_id)) %>%
  filter(adj_pval < 0.05) %>%
  mutate(rat_id = factor(rat_id, plot_order),
         human_id = factor(human_id, plot_order),
         adj_pval = case_when(adj_pval < 1e-100 ~ 1e-100,
                              T ~ adj_pval)) %>%
  ggplot(aes(human_id, rat_id, fill = -log10(adj_pval), size = -log10(adj_pval))) +
  geom_point(shape = 21, 
             alpha = 0.8,
             stroke = 0.2,
             color = "black") + 
  # scale_color_viridis(option = "E", direction = 1, 
  #                    name = "Adjusted p-value") + 
  scale_fill_gradientn(colors = ggsci::rgb_material(palette = "deep-orange")) +
  scale_size_continuous(range = c(1, 6)) +
  stripped_theme +
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust = 0.2),
        legend.position = "top", 
        panel.grid.major = element_line(color = "gray90", size = 0.2)) +
  xlab("Human") + 
  ylab("Rat")

ggsave("./final_plots/comparison/hypergeometric_comparison_tissue.pdf", width = 5, height = 5.5)  

```

##Figure 7D - Cross species gene annotation alluvial
```{r}

#classification rat
classification_rat_comp <-
  hpa_gene_classification(
    data = joined_atlas_comparison %>% filter(species == "rat") %>% select(c(-id,-species,-limma_log1p_tmm)),
    expression_col = "tmm",
    tissue_col = "tissue",
    gene_col = "mutual_id",
    enr_fold = 4,
    max_group_n = 5,
    det_lim = 1
  )

rat_comp_tau <- calculate_tau_score(
  joined_atlas_comparison %>% filter(species == "rat") %>% select(-id,-species,-limma_log1p_tmm)  %>%
    spread(tissue, tmm) %>%
    mutate_if(is.numeric, function(x) {
      log10(x + 1)
    }) %>%
    column_to_rownames("mutual_id")
)

classification_rat_comp <- classification_rat_comp %>%
  left_join(rat_comp_tau, by = c("gene" = "gene")) %>%
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score)) %>%
  mutate(species = "Rat")


#classification human

classification_human_comp <-
  hpa_gene_classification(
    data = joined_atlas_comparison %>% filter(species == "human") %>% select(c(-id,-species,-limma_log1p_tmm)),
    expression_col = "tmm",
    tissue_col = "tissue",
    gene_col = "mutual_id",
    enr_fold = 4,
    max_group_n = 5,
    det_lim = 1
  )

human_comp_tau <- calculate_tau_score(
  joined_atlas_comparison %>% filter(species == "human") %>% select(-id,-species,-limma_log1p_tmm)  %>%
    spread(tissue, tmm) %>%
    mutate_if(is.numeric, function(x) {
      log10(x + 1)
    }) %>%
    column_to_rownames("mutual_id")
)

classification_human_comp <- classification_human_comp %>%
  left_join(rat_comp_tau, by = c("gene" = "gene")) %>%
  mutate(tau_score = ifelse(spec_category == "not detected", NA, tau_score)) %>%
  mutate(species = "Human")

```


```{r}
alluvial_data_comp <- classification_rat_comp %>%
  select(gene, spec_category) %>%
  rename(Rat = spec_category) %>%
  left_join(
    classification_human_comp %>%
      select(gene, spec_category) %>%
      rename(Human = spec_category), 
    by = "gene"
  )


width = 0.1

alluv_1 <-
  
  alluvial_data_comp %>%
  mutate(Rat = str_to_sentence(Rat),
          Human = str_to_sentence(Human)) %>%
  select(Rat, 
         Human) %>%
  mutate(row_n = row_number()) %>%
  gather(bar, chunk, -row_n) %>%
  mutate(color_vars = 1) %>%
  group_by(row_n) %>%
  mutate(chunk_color = chunk[match(c("Human",
                                     "Rat")[color_vars], bar)]) %>% 
  ungroup() %>%
  
  
  mutate(chunk = factor(chunk, levels = c('Tissue enriched', 'Group enriched', 
                                          'Tissue enhanced', 'Low tissue specificity', 
                                          'Not detected')),
         bar = factor(bar, levels = c("Human",
                                      "Rat"))) %>%
  
  
  ggplot(aes(x = bar, stratum = chunk, alluvium = row_n,
             y = 1)) +
  
  geom_flow(aes(fill = chunk_color), 
            show.legend = F, width = width,
            knot.pos = 1/6) +
  geom_stratum(aes(fill = chunk), 
               show.legend = F, color = NA, width = width) +
  
  scale_x_discrete(expand = c(.1, .1), position = "top") +
  scale_fill_manual(values = c(gene_category_pal)) + 
  
  
  theme(axis.text.y = element_text(size = 18, face = "bold"),
        axis.text.x = element_blank(), 
        axis.ticks = element_blank(), 
        panel.background = element_blank(), 
        axis.title = element_blank()) +
  coord_flip()

# alluv_1

flow_data <-
  ggplot_build(alluv_1)$data[[1]] %>%
  as_tibble() %>%
  {
    if ("side" %in% names(.)) {
      .
    } else{
      mutate(.,
             side = case_when(flow == "from" ~ "start",
                              flow == "to" ~ "end"))
    }
  }



stratum_data <- 
  ggplot_build(alluv_1)$data[[2]]

flow_data_labels <-
  flow_data %>%
  as_tibble() %>%
  select(x, stratum, group, side, ymin, ymax) %>%
  pivot_wider(names_from = side,
              values_from = c(x, stratum, ymin, ymax)) %>%
  mutate_at(
    c(
      "x_end",
      "ymax_end",
      "ymin_end",
      "x_start",
      "ymax_start",
      "ymin_start"
    ),
    as.numeric
  ) %>%
  group_by(stratum_start, stratum_end, x_start, x_end) %>%
  summarise(
    y_end = (min(ymin_end) + max(ymax_end)) / 2,
    y_start = (min(ymin_start) + max(ymax_start)) / 2,
    size = max(ymax_start) - min(ymin_start)
  )

alluv_2 <- 
  alluv_1 +
  geom_text(data = flow_data_labels,
            aes(x = x_start + width/2,
                y = y_start, 
                label = size), 
            inherit.aes = F, 
            size = 3,
            angle = -90,
            hjust = 1,
            vjust = 0.5) +
  geom_text(data = flow_data_labels,
            aes(x = x_end - width/2,
                y = y_end, 
                label = size), 
            inherit.aes = F, 
            size = 3, 
            angle = -90,
            hjust = 0,
            vjust = 0.5) +
  
  # Stratum label
  
  geom_text(data = stratum_data %>%
              filter(x == 1),
            aes(x = x - width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = 1.5,
            inherit.aes = F) + 
  geom_text(data = stratum_data %>%
              filter(x == 2),
            aes(x = x + width/2,
                y = y,
                label = stratum),
            size = 4, 
            vjust = -0.5,
            inherit.aes = F) + 
  
  geom_text(data = stratum_data,
            aes(x = x, 
                y = y,
                label = ymax - ymin), 
            size = 4, 
            fontface = "bold",
            color = "white",
            inherit.aes = F)


alluv_2

if (include_many2many == TRUE & include_one2many == TRUE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_all_alluvial.pdf")
} else if (include_many2many == FALSE & include_one2many == TRUE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_only_on2one2many_alluvial.pdf")
} else if (include_many2many == FALSE & include_one2many == FALSE){
  comp_alluv_file_name = paste0("./final_plots/comparison/",ortholog_data_from,"_comparison_only_one2one_alluvial.pdf")}

ggsave(comp_alluv_file_name,width = 8, height = 3)

```

#Figure S1 - Brain Metadata alluvial plot
```{r}
#Function adapted from Max Karlsson
multi_alluvial_plot <-
  function(data,
           vars,
           chunk_levels,
           pal,
           color_by = c(1, 3, 3)) {
    selvars = vars
    
    if (!is.null(names(vars))) {
      vars = names(vars)
    }
    
    alluv_1 <-
      data %>%
      ungroup() %>%
      select(selvars) %>%
      ungroup() %>%
      mutate(row_n = row_number()) %>%
      gather(bar, chunk,-row_n) %>%
      left_join(tibble(bar = vars,
                       color_vars = color_by),
                by = "bar") %>%
      group_by(row_n) %>%
      mutate(chunk_color = chunk[match(vars[color_vars], bar)]) %>%
      ungroup() %>%
      
      mutate(chunk = factor(chunk, levels = chunk_levels),
             bar = factor(bar, levels = vars)) %>%
      
      
      ggplot(aes(
        x = bar,
        stratum = chunk,
        alluvium = row_n,
        y = 1
      )) +
      
      geom_flow(aes(fill = chunk_color),
                show.legend = F) +
      geom_stratum(aes(fill = chunk),
                   show.legend = F, color = NA) +
      
      scale_x_discrete(expand = c(.1, .1), position = "top") +
       scale_fill_manual(values = pal) +


      theme(
        axis.text.x = element_text(size = 18, face = "bold"),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank()
      )


    
    
    flow_data <-
      ggplot_build(alluv_1)$data[[1]] %>%
      as_tibble() %>%
      {
        if ("side" %in% names(.)) {
          .
        } else{
          mutate(.,
                 side = case_when(flow == "from" ~ "start",
                                  flow == "to" ~ "end"))
        }
      }
    
    
    stratum_data <-
      ggplot_build(alluv_1)$data[[2]]
    
    flow_data_labels <-
      flow_data %>%
      as_tibble() %>%
      
      select(x, stratum, group, side, ymin, ymax) %>%
      pivot_wider(names_from = side,
                  values_from = c(x, stratum, ymin, ymax)) %>%
      
      mutate_at(
        c(
          "x_end",
          "ymax_end",
          "ymin_end",
          "x_start",
          "ymax_start",
          "ymin_start"
        ),
        as.numeric
      ) %>%
      group_by(stratum_start, stratum_end, x_start, x_end) %>%
      summarise(
        y_end = (min(ymin_end) + max(ymax_end)) / 2,
        y_start = (min(ymin_start) + max(ymax_start)) / 2,
        size = max(ymax_start) - min(ymin_start)
      )
    
    alluv_1 <-
      alluv_1 +
      # geom_text(
      #   data = flow_data_labels,
      #   aes(x = x_start + 1 / 6,
      #       y = y_start,
      #       label = size),
      #   inherit.aes = F,
      #   size = 3,
      #   hjust = 0
      # ) +
      # geom_text(
      #   data = flow_data_labels,
      #   aes(x = x_end - 1 / 6,
      #       y = y_end,
      #       label = size),
      #   inherit.aes = F,
      #   size = 3,
      #   hjust = 1
      # ) +
      # 
      # Stratum label
      geom_text(
        data = stratum_data,
        aes(
          x = x,
          y = y,
          label = paste(stratum#, 
                       # paste("[", ymax - ymin, "]", sep = "")
                        )
        ),
        size = 4,
        inherit.aes = F
      )
                
    
    alluv_1
  }
```

```{r}
metadata_b <- metadata %>% filter(organ_name =="Brain")

t_names <- metadata_b$tissue_name %>% unique()
r_names <- metadata_b$region_tissue_name %>% unique()
c_names <- metadata_b$consensus_tissue_name %>% unique()
o_names <- metadata_b$organ_name %>% unique()

t_colors <- metadata_b %>% select(tissue_name, tissue_color) %>% unique() %>% rename (chunk = tissue_name, color = tissue_color)
r_colors <- metadata_b %>% select(region_tissue_name, region_tissue_color) %>% unique() %>% rename (chunk = region_tissue_name, color = region_tissue_color)
c_colors <- metadata_b %>% select(consensus_tissue_name, consensus_tissue_color) %>% unique() %>% rename (chunk = consensus_tissue_name, color = consensus_tissue_color)
o_colors <- metadata_b %>% select(organ_name, organ_color) %>% unique() %>% rename (chunk = organ_name, color = organ_color)

bind_colors <- bind_rows(t_colors, r_colors, o_colors) %>% unique() %>% arrange(chunk) %>% mutate(row_n = row_number())
pal <-  bind_colors$color
pal <- setNames(pal, bind_colors$chunk)

data = metadata_b
vars = c("tissue_name", "region_tissue_name", "organ_name")
chunk_levels = c(t_names, r_names, o_names) %>% unique()
color_by = c(1, 3, 3)

multi_alluvial_plot(data = metadata_b, vars = vars, chunk_levels = chunk_levels, pal = pal, color_by = c(1, 3, 3))

ggsave("final_plots/alluvial/brain-tco-1_p.pdf", height = 7, width = 10)
```


#Figure S2 - Normalisation comparison TPM vs nTPM (TMM)
```{r}
tpm_sample <-read_csv("./data/final_data/curated_pTPM_rattus_norvegicus_v103.csv")


sample_subset_ID <- metadata[match(unique(metadata$consensus_tissue_name), metadata$consensus_tissue_name),] %>% select(ID)
sample_subset_tmm <- tmm_sample %>% select(target_id, sample_subset_ID$ID) %>% gather(sample, tmm, -1)
sample_subset_tpm <- tpm_sample %>% select(target_id, sample_subset_ID$ID) %>% gather(sample, tpm, -1)

# plot_data <- left_join(sample_subset_tmm, sample_subset_tpm, by = c("target_id", "sample")) %>%
#   mutate(log1p_tmm = log10(tmm + 1), log1p_tpm = log10(tpm +1)) %>% select(-tmm, -tpm) %>% 
#   gather(expression_type, expression, -1, -2) %>% 
#   left_join(metadata %>% select(ID, tissue_name, consensus_tissue_name), by = c("sample" = "ID"))

plot_data <- left_join(sample_subset_tmm, sample_subset_tpm, by = c("target_id", "sample")) %>%
  gather(expression_type, expression, -1, -2) %>% mutate_if(is.numeric, function(x) {
                log10(x + 1)
              }) %>% 
  left_join(metadata %>% select(ID, tissue_name, consensus_tissue_name), by = c("sample" = "ID"))


pal_tbl <- metadata %>% select(consensus_tissue_name, consensus_tissue_color) %>% distinct()
pal <- pal_tbl %>% pull(consensus_tissue_color)
pal <- setNames(pal, pal_tbl %>% pull(consensus_tissue_name))


ggplot(data = plot_data, aes(x = expression, y =sample, fill = consensus_tissue_name)) +
  geom_boxplot(draw_quantiles = 0.5, outlier.size = 0.5, outlier.alpha = 0.3)+
  facet_wrap(~expression_type)+
  scale_fill_manual(values = pal) +
  theme(legend.position = "none", 
        axis.title = element_blank())

ggsave("./final_plots/tpm_tmm_comp_boxplot.pdf", width=7, height = 12)

```

#Figure S3 - Spearman heatmap (tissye type level)
```{r}

##Spearman's roh heatmap at tissue type level
if(file.exists("./data/final_data/spearman_corr_tissues.csv")) {
  tissue_tmm_spearman <- read_csv("./data/final_data/spearman_corr_tissues.csv")
} else {
  tissue_tmm_spearman <-  tmm_tissue %>%
    spread(tissue_name, tmm) %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "tissue_name")
  write_csv(as.data.frame(tissue_tmm_spearman) %>% as_tibble(rownames = "tissue_name"),"./data/final_data/spearman_corr_tissues.csv")
}

tissue_tmm_spearman %>% 
  column_to_rownames("tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()

ggsave("./final_plots/data_presentation/spearman_corr_tissue.pdf", height = 20, width = 20)


```

#Figure S4 - Comparison Pheatmap
```{r}
if(file.exists("./data/final_data/spearman_corr_tissues.csv")) {
  tissue_tmm_spearman <- read_csv("./data/final_data/spearman_corr_tissues.csv")
} else {
  tissue_tmm_spearman <-  tmm_tissue %>%
    spread(tissue_name, tmm) %>%
    column_to_rownames("target_id") %>%
    cor(method = "spearman", use = "pairwise.complete.obs") %>%
    as.data.frame() %>%
    as_tibble(rownames = "tissue_name")
  write_csv(as.data.frame(tissue_tmm_spearman) %>% as_tibble(rownames = "tissue_name"),"./data/final_data/spearman_corr_tissues.csv")
}



joined_atlas_comparison_pheat_data <-  joined_atlas_comparison %>%
  select(mutual_id, id,limma_log1p_tmm ) %>%  
  spread(id, limma_log1p_tmm) %>% 
  column_to_rownames("mutual_id") %>% 
  cor(method = "spearman", use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  as_tibble(rownames = "tissue_name")

joined_atlas_comparison_pheat_data %>% 
  column_to_rownames("tissue_name") %>% 
  pheatmap(
   # clustering_method = "ward.D2",
    cellheight = 8,
    cellwidth = 8, 
    border_color = NA,
    color = viridis::inferno(20, direction = -1),
    show_rownames = FALSE, 
    ) %>% 
  as.ggplot()

ggsave("./final_plots/comparison/pheatmap.pdf", width = 20, height = 20)

```
```{r}
sessionInfo()

```

